From 9ac47eaa5d1d98b09f7b04cfc758ceda7d735f90 Mon Sep 17 00:00:00 2001
From: ningwanyi <sh8400947@qq.com>
Date: Thu, 11 May 2023 18:55:57 +0000
Subject: [PATCH 01/12] some evaluation

---
 benchmark.py |   2 +
 fmzip.py     |   0
 gptj.py      |   0
 opt.py       | 104 +++++++++++++++++++++++++++++----------------------
 opt_delta.py |  24 ++++++++----
 5 files changed, 79 insertions(+), 51 deletions(-)
 create mode 100644 benchmark.py
 create mode 100644 fmzip.py
 create mode 100644 gptj.py

diff --git a/benchmark.py b/benchmark.py
new file mode 100644
index 0000000..acf2ea0
--- /dev/null
+++ b/benchmark.py
@@ -0,0 +1,2 @@
+wbits = [2, 3, 4, 8]
+sparsity = [0.0, 0.5, 0.9]
\ No newline at end of file
diff --git a/fmzip.py b/fmzip.py
new file mode 100644
index 0000000..e69de29
diff --git a/gptj.py b/gptj.py
new file mode 100644
index 0000000..e69de29
diff --git a/opt.py b/opt.py
index edf40bc..77f0f9b 100644
--- a/opt.py
+++ b/opt.py
@@ -6,7 +6,7 @@
 from gptq import *
 from modelutils import *
 from quant import quantize, Quantizer, Quant3Linear, make_quant3
-
+from prettytable import PrettyTable
 def get_opt(model):
     import torch
     def skip(*args, **kwargs):
@@ -223,6 +223,7 @@ def forward(self, inp, **kwargs):
     print(ppl.item())
 
     model.config.use_cache = use_cache
+    return ppl.item()
 
 # TODO: perform packing on GPU
 def opt_pack3(model, quantizers):
@@ -351,6 +352,48 @@ def sync():
             print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
 
 
+def main(args):
+    """Quantize (or load) the model described by `args`, evaluate it on `args.dataset`, return the perplexity."""
+    if args.load:
+        model = load_quant3(args.model, args.load)
+    else:
+        model = get_opt(args.model)
+        model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    # Stays None on the 16-bit and --nearest (RTN) paths, which produce nothing to pack.
+    quantizers = None
+    if args.wbits < 16 and not args.nearest:
+        tick = time.time()
+        quantizers = opt_sequential(model, dataloader, DEV)
+        print(time.time() - tick)
+
+    if args.benchmark:
+        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+        if len(gpus) > 1:
+            opt_multigpu(model, gpus)
+        else:
+            model = model.to(DEV)
+        input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+        benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()  # a loaded checkpoint is only benchmarked, never re-evaluated or re-saved
+
+    dataloader, testloader = get_loaders(
+        args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    print(args.dataset)  # read from args; do not rely on the driver's module-level loop variable
+    ppl = opt_eval(model, testloader, DEV)
+    # Packing needs the GPTQ quantizers; skip saving when none were produced.
+    if args.save and quantizers is not None:
+        opt_pack3(model, quantizers)
+        torch.save(model.state_dict(), args.save)
+    return ppl
+
+        
 if __name__ == '__main__':
     import argparse
     from datautils import *
@@ -358,11 +401,11 @@ def sync():
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        'model', type=str,
+        '--model', type=str, default='lnair/opt-1.3b-wikitext2',
         help='OPT model to load; pass `facebook/opt-X`.'
     )
     parser.add_argument(
-        'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
+        '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
         help='Where to extract calibration data from.'
     )
     parser.add_argument(
@@ -427,44 +470,17 @@ def sync():
     )
 
     args = parser.parse_args()
-
-    if args.load:
-        model = load_quant3(args.model, args.load)
-    else:
-        model = get_opt(args.model)
-        model.eval()
-
-    dataloader, testloader = get_loaders(
-        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
-    )
-
-    if args.wbits < 16 and not args.nearest:
-        tick = time.time()
-        quantizers = opt_sequential(model, dataloader, DEV)
-        print(time.time() - tick)
-
-    if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            opt_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
-        if args.benchmark:
-            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
-            benchmark(model, input_ids, check=args.check)
-    if args.load:
-        exit()
-
-    datasets = ['wikitext2', 'ptb', 'c4'] 
-    if args.new_eval:
-      datasets = ['wikitext2', 'ptb-new', 'c4-new']
-    for dataset in datasets: 
-        dataloader, testloader = get_loaders(
-            dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
-        )
-        print(dataset)
-        opt_eval(model, testloader, DEV)
-
-    if args.save:
-        opt_pack3(model, quantizers)
-        torch.save(model.state_dict(), args.save) 
+    
+    results = PrettyTable()
+    results.field_names = ['Bits', 'wiki', 'ptb', 'c4']
+    for n_bits in [4, 3, 2]:
+        ppls = []
+        for dataset in ['wikitext2', 'ptb', 'c4']:
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])
+    print(results)
+    print('finished.')
diff --git a/opt_delta.py b/opt_delta.py
index f9f6bbf..82371a4 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -7,7 +7,7 @@
 from gptq import *
 from modelutils import *
 from quant import *
-
+from prettytable import PrettyTable
 import copy
 #from prettytable import PrettyTable
 
@@ -548,7 +548,9 @@ def main(args):
     if args.save:
         opt_pack3(model, quantizers)
         torch.save(model.state_dict(), args.save) 
-
+        
+    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    return ppl, n_params, comp_time
 
 if __name__ == '__main__':
     import argparse
@@ -601,7 +603,7 @@ def main(args):
         help='Whether to perform symmetric quantization.'
     )
     parser.add_argument(
-        '--save', type=str, default='',
+        '--save', type=str, default='opt-1.3b-wikitext2-wbits2.pt',
         help='Save quantized checkpoint under this name.'
     )
     parser.add_argument(
@@ -647,8 +649,16 @@ def main(args):
     )
     args = parser.parse_args()
 
-    #results = PrettyTable()
-
-    main(args)
-    
+    results = PrettyTable()
+    results.field_names = ['Bits', 'n_params', 'Time', 'wiki', 'ptb', 'c4']
+    for n_bits in [2, 3, 4]:
+        ppls = []
+        for dataset in ['wikitext2', 'ptb', 'c4']:
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'opt-not-delta1.3b-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl, n_params, comp_time = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
+    print(results)
     print('finished.')

From eca0c2e3215455d047d4c4bb5e98c70d632fd7c8 Mon Sep 17 00:00:00 2001
From: kumbong <kumbonghermann@gmail.com>
Date: Fri, 12 May 2023 08:38:37 +0000
Subject: [PATCH 02/12] experiment scripts + gptj

---
 .gitignore           |   5 +-
 evaluation.sh        |  52 +++++
 file_0.txt           |   7 +
 file_1.txt           |   7 +
 file_2.txt           |   7 +
 file_3.txt           |   7 +
 file_4.txt           |   7 +
 file_5.txt           |   7 +
 gptj.py              | 492 +++++++++++++++++++++++++++++++++++++++++++
 jt_datautils/cot.py  | 105 +++++++++
 jt_datautils/pile.py |  77 +++++++
 opt_delta.py         |  30 +--
 requirements.txt     |   8 +-
 src/fmzip            |   1 +
 14 files changed, 798 insertions(+), 14 deletions(-)
 create mode 100755 evaluation.sh
 create mode 100644 file_0.txt
 create mode 100644 file_1.txt
 create mode 100644 file_2.txt
 create mode 100644 file_3.txt
 create mode 100644 file_4.txt
 create mode 100644 file_5.txt
 create mode 100644 jt_datautils/cot.py
 create mode 100644 jt_datautils/pile.py
 create mode 160000 src/fmzip

diff --git a/.gitignore b/.gitignore
index dbd6338..761a9f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,7 @@ dist/
 .idea
 *.egg-info/
 *.safetensors
-outputs/
\ No newline at end of file
+outputs/
+.cache/
+data/
+results/
\ No newline at end of file
diff --git a/evaluation.sh b/evaluation.sh
new file mode 100755
index 0000000..aaea40f
--- /dev/null
+++ b/evaluation.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+CUDA_VISIBLE_DEVICES=0 python3 opt_delta.py \
+    --groupsize 1024 \
+    --delta \
+    --benchmark_results "file_0.txt" \
+&
+CUDA_VISIBLE_DEVICES=1 python3 opt_delta.py \
+    --groupsize 1024 \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.9 \
+    --delta \
+    --benchmark_results "file_1.txt" \
+&
+CUDA_VISIBLE_DEVICES=2 python3 opt_delta.py \
+    --groupsize 1024 \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.99 \
+    --delta \
+    --benchmark_results "file_2.txt" \
+&
+CUDA_VISIBLE_DEVICES=3 python3 opt_delta.py \
+    --groupsize 1024 \
+    --delta \
+    --rank 32 \
+    --benchmark_results "file_3.txt" \
+&
+CUDA_VISIBLE_DEVICES=4 python3 opt_delta.py \
+    --groupsize 1024 \
+    --delta \
+    --rank 16  \
+    --benchmark_results "file_4.txt" \
+&
+CUDA_VISIBLE_DEVICES=5 python3 opt_delta.py \
+    --groupsize 1024 \
+    --delta \
+    --rank 64 \
+    --benchmark_results "file_5.txt" \
+&
+CUDA_VISIBLE_DEVICES=6 python3 opt_delta.py \
+    --groupsize 1024 \
+    --delta \
+    --rank 32 \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.9 \
+    --benchmark_results "file_6.txt" \
+&
+CUDA_VISIBLE_DEVICES=7 python3 opt_delta.py \
+    --groupsize 1024 \
+    --delta \
+    --rank 32 \
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.99 \
+    --benchmark_results "file_7.txt" 
+# Block until every background run has finished writing its results file.
+wait
diff --git a/file_0.txt b/file_0.txt
new file mode 100644
index 0000000..7fd8537
--- /dev/null
+++ b/file_0.txt
@@ -0,0 +1,7 @@
++------+-----------+--------------------+--------------------+-------------------+--------------------+
+| Bits |  n_params |        Time        |        wiki        |        ptb        |         c4         |
++------+-----------+--------------------+--------------------+-------------------+--------------------+
+|  2   | 107356160 | 315.56584095954895 | 12.70229721069336  | 18.99186134338379 | 16.049821853637695 |
+|  3   | 107356160 | 254.49543404579163 | 12.98267936706543  | 19.62110710144043 | 16.652606964111328 |
+|  4   | 107356160 | 285.25878047943115 | 12.996271133422852 | 19.65008544921875 | 16.664426803588867 |
++------+-----------+--------------------+--------------------+-------------------+--------------------+
\ No newline at end of file
diff --git a/file_1.txt b/file_1.txt
new file mode 100644
index 0000000..1af604f
--- /dev/null
+++ b/file_1.txt
@@ -0,0 +1,7 @@
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+| Bits |  n_params |        Time        |        wiki        |        ptb         |         c4         |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+|  2   | 107356160 | 288.47421526908875 | 12.124371528625488 | 17.396339416503906 | 15.110072135925293 |
+|  3   | 107356160 | 310.34640645980835 | 12.246709823608398 | 17.316566467285156 | 14.97178840637207  |
+|  4   | 107356160 | 262.9206793308258  | 12.252873420715332 | 17.329992294311523 | 14.979094505310059 |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
diff --git a/file_2.txt b/file_2.txt
new file mode 100644
index 0000000..4ca10d2
--- /dev/null
+++ b/file_2.txt
@@ -0,0 +1,7 @@
++------+-----------+-------------------+--------------------+--------------------+--------------------+
+| Bits |  n_params |        Time       |        wiki        |        ptb         |         c4         |
++------+-----------+-------------------+--------------------+--------------------+--------------------+
+|  2   | 107356160 | 289.8132817745209 | 13.843452453613281 | 16.968669891357422 | 14.779077529907227 |
+|  3   | 107356160 | 307.7978012561798 | 13.91087532043457  | 16.95600700378418  | 14.742414474487305 |
+|  4   | 107356160 | 262.0493402481079 | 13.913723945617676 | 16.955684661865234 | 14.743617057800293 |
++------+-----------+-------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
diff --git a/file_3.txt b/file_3.txt
new file mode 100644
index 0000000..672862a
--- /dev/null
+++ b/file_3.txt
@@ -0,0 +1,7 @@
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+| Bits |  n_params |        Time        |        wiki        |        ptb         |         c4         |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+|  2   | 107356160 | 283.91542887687683 | 12.507635116577148 | 18.553525924682617 | 15.613986015319824 |
+|  3   | 107356160 | 287.85402369499207 | 12.571398735046387 | 18.915355682373047 | 15.952068328857422 |
+|  4   | 107356160 | 279.67540669441223 | 12.590620040893555 | 18.968795776367188 | 15.981791496276855 |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
diff --git a/file_4.txt b/file_4.txt
new file mode 100644
index 0000000..e2c3608
--- /dev/null
+++ b/file_4.txt
@@ -0,0 +1,7 @@
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+| Bits |  n_params |        Time        |        wiki        |        ptb         |         c4         |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+|  2   | 107356160 | 274.58040595054626 | 12.96647834777832  | 18.44032859802246  | 15.488606452941895 |
+|  3   | 107356160 | 277.05651092529297 | 12.934049606323242 | 18.722591400146484 | 15.750381469726562 |
+|  4   | 107356160 | 282.69956731796265 | 12.932695388793945 | 18.789344787597656 | 15.76345443725586  |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
diff --git a/file_5.txt b/file_5.txt
new file mode 100644
index 0000000..2655a74
--- /dev/null
+++ b/file_5.txt
@@ -0,0 +1,7 @@
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+| Bits |  n_params |        Time        |        wiki        |        ptb         |         c4         |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
+|  2   | 107356160 | 262.34580183029175 | 12.373908996582031 | 18.664175033569336 | 15.718718528747559 |
+|  3   | 107356160 | 271.34020018577576 | 12.558426856994629 | 19.13666343688965  | 16.14783477783203  |
+|  4   | 107356160 | 255.5096390247345  | 12.59843921661377  | 19.159931182861328 | 16.166603088378906 |
++------+-----------+--------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
diff --git a/gptj.py b/gptj.py
index e69de29..a4e34e7 100644
--- a/gptj.py
+++ b/gptj.py
@@ -0,0 +1,492 @@
+import time
+
+import torch
+import torch.nn as nn
+
+from gptq import *
+from modelutils import *
+from quant import quantize, Quantizer, Quant3Linear, make_quant3
+from prettytable import PrettyTable
+from transformers import GPTJForCausalLM
+
+def get_opt(model):
+    import torch
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    model = GPTJForCausalLM.from_pretrained(
+    model,
+    revision="float16",
+    torch_dtype=torch.float16,
+    ).to("cuda")
+    model.seqlen = model.config.max_position_embeddings
+    return model
+
+@torch.no_grad()
+def opt_sequential(model, dataloader, dev):
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(layers)):
+        layer = layers[i].to(dev)
+
+        subset = find_layers(layer)
+        gptq = {}
+        for name in subset:
+            gptq[name] = GPTQ(subset[name])
+            gptq[name].quantizer = Quantizer()
+            gptq[name].quantizer.configure(
+                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+            )
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                gptq[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(i, name)
+            print('Quantizing ...')
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+            quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
+            gptq[name].free()
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq 
+        torch.cuda.empty_cache()
+
+        inps, outs = outs, inps
+
+    model.config.use_cache = use_cache
+    
+    return quantizers
+
+@torch.no_grad()
+def opt_eval(model, testenc, dev):
+    print('Evaluating ...')
+
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.decoder.layers
+
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+        try:
+            model(batch)
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    for i in range(len(layers)):
+        print(i)
+        layer = layers[i].to(dev)
+
+        if args.nearest:
+            subset = find_layers(layer)
+            for name in subset:
+                quantizer = Quantizer()
+                quantizer.configure(
+                    args.wbits, perchannel=True, sym=args.sym, mse=False
+                )
+                W = subset[name].weight.data
+                quantizer.find_params(W, weight=True)
+                subset[name].weight.data = quantize(
+                    W, quantizer.scale, quantizer.zero, quantizer.maxq
+                ).to(next(iter(layer.parameters())).dtype)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+
+    if model.model.decoder.final_layer_norm is not None:
+        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
+    if model.model.decoder.project_out is not None:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.decoder.final_layer_norm is not None:
+            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
+        if model.model.decoder.project_out is not None:
+            hidden_states = model.model.decoder.project_out(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    print(ppl.item())
+
+    model.config.use_cache = use_cache
+    return ppl.item()
+
+# TODO: perform packing on GPU
+def opt_pack3(model, quantizers):
+    layers = find_layers(model)
+    layers = {n: layers[n] for n in quantizers}
+    make_quant3(model, quantizers, faster=args.faster_kernel)
+    qlayers = find_layers(model, [Quant3Linear])
+    print('Packing ...')
+    for name in qlayers:
+        print(name)
+        quantizers[name] = quantizers[name].cpu()
+        qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero)
+    print('Done.')
+    return model
+
+def load_quant3(model, checkpoint):
+    from transformers import OPTConfig, OPTForCausalLM 
+    config = OPTConfig.from_pretrained(model)
+    def noop(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = noop 
+    torch.nn.init.uniform_ = noop 
+    torch.nn.init.normal_ = noop 
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = OPTForCausalLM(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
+        if name in layers:
+            del layers[name]
+    make_quant3(model, layers, faster=args.faster_kernel)
+
+    print('Loading model ...')
+    model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = model.config.max_position_embeddings
+    print('Done.')
+
+    return model
+
+def opt_multigpu(model, gpus):
+    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
+    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
+    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
+        model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
+    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
+        model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
+    if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
+        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
+    import copy
+    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
+
+    cache = {'mask': None}
+
+    class MoveModule(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+            self.dev = next(iter(self.module.parameters())).device
+        def forward(self, *inp, **kwargs):
+            inp = list(inp)
+            if inp[0].device != self.dev:
+                inp[0] = inp[0].to(self.dev)
+            if cache['mask'] is None or cache['mask'].device != self.dev:
+                cache['mask'] = kwargs['attention_mask'].to(self.dev)
+            kwargs['attention_mask'] = cache['mask']
+            tmp = self.module(*inp, **kwargs)
+            return tmp
+
+    layers = model.model.decoder.layers
+    pergpu = math.ceil(len(layers) / len(gpus))
+    for i in range(len(layers)):
+        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
+
+    model.gpus = gpus
+
+def benchmark(model, input_ids, check=False):
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
+    torch.cuda.synchronize()
+
+    cache = {'past': None}
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache['past']:
+                cache['past'][i] = None
+        return tmp
+    for i, layer in enumerate(model.model.decoder.layers):
+        layer.register_forward_hook(clear_past(i))
+
+    print('Benchmarking ...')
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.
+
+    def sync():
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+            out = model(
+                input_ids[:, i].reshape(-1),
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            if check and i != input_ids.numel() - 1:
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
+            cache['past'] = list(out.past_key_values)
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check:
+            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+
+
+def main(args):
+    """Quantize (or load) the model described by `args`, evaluate it on `args.dataset`, return the perplexity."""
+    if args.load:
+        model = load_quant3(args.model, args.load)
+    else:
+        model = get_opt(args.model)
+        model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    # Stays None on the 16-bit and --nearest (RTN) paths, which produce nothing to pack.
+    quantizers = None
+    if args.wbits < 16 and not args.nearest:
+        tick = time.time()
+        quantizers = opt_sequential(model, dataloader, DEV)
+        print(time.time() - tick)
+
+    if args.benchmark:
+        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+        if len(gpus) > 1:
+            opt_multigpu(model, gpus)
+        else:
+            model = model.to(DEV)
+        input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+        benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()  # a loaded checkpoint is only benchmarked, never re-evaluated or re-saved
+
+    dataloader, testloader = get_loaders(
+        args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    print(args.dataset)  # `dataset` is not defined in this scope; the value lives on args
+    ppl = opt_eval(model, testloader, DEV)
+    # Packing needs the GPTQ quantizers; skip saving when none were produced.
+    if args.save and quantizers is not None:
+        opt_pack3(model, quantizers)
+        torch.save(model.state_dict(), args.save)
+    return ppl
+
+        
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--model', type=str, default='lnair/opt-1.3b-wikitext2',
+        help='OPT model to load; pass `facebook/opt-X`.'
+    )
+    parser.add_argument(
+        '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
+        help='Where to extract calibration data from.'
+    )
+    parser.add_argument(
+        '--seed',
+        type=int, default=0, help='Seed for sampling the calibration data.'
+    )
+    parser.add_argument(
+        '--nsamples', type=int, default=128,
+        help='Number of calibration data samples.'
+    )
+    parser.add_argument(
+        '--percdamp', type=float, default=.01,
+        help='Percent of the average Hessian diagonal to use for dampening.'
+    )
+    parser.add_argument(
+        '--nearest', action='store_true',
+        help='Whether to run the RTN baseline.'
+    ) 
+    parser.add_argument(
+        '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
+        help='#bits to use for quantization; use 16 for evaluating base model.'
+    )
+    parser.add_argument(
+        '--trits', action='store_true',
+        help='Whether to use trits for quantization.'
+    )
+    parser.add_argument(
+        '--groupsize', type=int, default=-1,
+        help='Groupsize to use for quantization; default uses full row.'
+    )
+    parser.add_argument(
+        '--sym', action='store_true',
+        help='Whether to perform symmetric quantization.'
+    )
+    parser.add_argument(
+        '--save', type=str, default='',
+        help='Save quantized checkpoint under this name.'
+    )
+    parser.add_argument(
+        '--load', type=str, default='',
+        help='Load quantized model.'
+    )
+    parser.add_argument(
+        '--benchmark', type=int, default=0,
+        help='Number of tokens to use for benchmarking.'
+    )
+    parser.add_argument(
+        '--check', action='store_true',
+        help='Whether to compute perplexity during benchmarking for verification.'
+    )
+    parser.add_argument(
+        '--new-eval', action='store_true',
+        help='Whether to use the new PTB and C4 eval.'
+    )
+    parser.add_argument(
+        '--faster-kernel', action='store_true',
+        help='Whether to use the new faster kernel for benchmarking.'
+    )
+    parser.add_argument(
+        '--act-order', action='store_true',
+        help='Whether to apply the activation order GPTQ heuristic'
+    )
+
+    args = parser.parse_args()
+    
+    get_opt("EleutherAI/gpt-j-6B")
+    # results = PrettyTable()
+    # results.field_names = ['Bits', 'wiki', 'ptb', 'c4']
+    # for n_bits in [4, 3, 2]:
+    #     ppls = []
+    #     for dataset in ['wikitext2', 'ptb', 'c4']:
+    #         args.dataset = dataset
+    #         args.wbits = n_bits
+    #         args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits)
+    #         ppl = main(args)
+    #         ppls.append(ppl)
+    #     results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])
+    # print(results)
+    print('finished.')
diff --git a/jt_datautils/cot.py b/jt_datautils/cot.py
new file mode 100644
index 0000000..15796e9
--- /dev/null
+++ b/jt_datautils/cot.py
@@ -0,0 +1,105 @@
+import os
+import re
+import torch
+import json
+from torch.utils.data import IterableDataset, DataLoader
+from itertools import cycle, islice
+import random
+from datasets import Dataset
+from datasets import load_dataset, load_from_disk
+#from comm.comm_utils import *
+
+
+
+class StreamDataset(IterableDataset):
+    """Endless stream of ``seq_length``-token samples built from a JSON file of texts."""
+    def __init__(self, cot_data_path, tokenizer, seq_length=1024):
+        """Load the JSON mapping at ``cot_data_path``; its values are the texts to tokenize."""
+        self.cot_data_path = cot_data_path
+        with open(cot_data_path) as f:
+            self.cot_data = json.load(f)
+
+        self.buffer_tokens = []  # not read anywhere in this class
+        self.tokenizer = tokenizer
+        self.seq_length = seq_length
+        self.it = None  # stream created lazily by __iter__ and then reused
+
+    def state_dict(self):
+        """Return an empty dict: this dataset keeps no resumable state."""
+        return {}
+
+    def load_state_dict(self, state_dict):
+        """Accept and ignore ``state_dict`` (there is nothing to restore)."""
+        pass
+
+    def get_sequence_from_cot(self):
+        """Yield 1-D long tensors of exactly ``seq_length`` token ids, forever.
+
+        Each pass shuffles the entries and concatenates their tokens (two newlines are
+        appended to every text) until ``seq_length`` is reached; the excess is dropped.
+
+        Raises:
+            ValueError: if a whole pass cannot fill a single sequence, which would
+                otherwise make this generator spin forever without yielding.
+        """
+        while True:
+            keys = list(self.cot_data.keys())
+            random.shuffle(keys)
+            input_ids = []
+            produced = False
+            for k in keys:
+                v = self.cot_data[k]
+                input_ids += self.tokenizer(v + '\n\n')['input_ids']
+                if len(input_ids) < self.seq_length:
+                    continue
+                input_ids = input_ids[:self.seq_length]
+                input_ids = torch.tensor(input_ids).long()
+                produced = True
+                yield input_ids
+                input_ids = []
+            if not produced:
+                raise ValueError('CoT data is too short to fill one sequence of seq_length tokens')
+
+    def get_sequence(self):
+        """Yield ``{'input_ids': tensor}`` samples forever."""
+        # No itertools.cycle here: the source never ends, so cycle would only
+        # keep a copy of every tensor it has yielded and grow without bound.
+        for input_ids in self.get_sequence_from_cot():
+            yield {
+                'input_ids': input_ids,
+            }
+
+    def get_stream(self):
+        """Return the endless sample generator (already infinite, so not cycled)."""
+        return self.get_sequence()
+
+    def __iter__(self):
+        """Return the single shared stream, creating it on first use."""
+        if self.it is None:
+            self.it = self.get_stream()
+        return self.it
+    
+    
+    
+def get_cot_train_data_loader(args, tokenizer, num_workers=0, state_dict=None):
+    """Build the training DataLoader over the CoT stream in ./data/mmlu-cot.json.
+
+    Uses ``args.seq_length``, ``args.batch_size`` and ``args.data_group_size``;
+    ``state_dict``, when given, is passed to the dataset's ``load_state_dict``.
+    """
+    dataset = StreamDataset(
+        './data/mmlu-cot.json', tokenizer=tokenizer, seq_length=args.seq_length
+    )
+    if state_dict is not None:
+        dataset.load_state_dict(state_dict)
+
+    return torch.utils.data.DataLoader(
+        dataset,
+        batch_size=args.batch_size * args.data_group_size,
+        shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=None,
+    )
+
+def get_cot_ds(data_path, tokenizer, seq_length):
+    """Return a StreamDataset over ``mmlu-cot.json`` located inside ``data_path``."""
+    cot_file = os.path.join(data_path, 'mmlu-cot.json')
+    return StreamDataset(cot_file, tokenizer=tokenizer, seq_length=seq_length)
\ No newline at end of file
diff --git a/jt_datautils/pile.py b/jt_datautils/pile.py
new file mode 100644
index 0000000..fbddca5
--- /dev/null
+++ b/jt_datautils/pile.py
@@ -0,0 +1,77 @@
+
+import os
+import re
+import torch
+from torch.utils.data import IterableDataset, DataLoader
+from itertools import cycle, islice
+import random
+from datasets import Dataset
+from datasets import load_dataset, load_from_disk
+# from comm.comm_utils import *
+
+
+class StreamDataset(IterableDataset):  # packs streamed documents into seq_length-token samples
+    default_doc_separator = ''  # used when no (or an empty) doc_separator is passed
+    def __init__(self, data, tokenizer, seq_length=1024, doc_separator=None):
+        self.data = data  # iterable of records exposing a 'text' field (see get_sequence)
+        self.tokenizer = tokenizer
+        self.seq_length = seq_length
+        self.doc_separator = doc_separator or StreamDataset.default_doc_separator
+        self.it = None  # stream created lazily by __iter__
+        self.iter_count = 0  # number of documents read from `data`
+        self.buffer_tokens = []  # tokens read but not yet emitted in a sample
+    # Progress snapshot: documents consumed so far plus the tokens not yet emitted.
+    def state_dict(self):
+        return {
+            'iter_count': self.iter_count,
+            'buffer_tokens': self.buffer_tokens,
+        }
+    # Restore a snapshot and skip the consumed documents; requires `data` to provide skip().
+    def load_state_dict(self, state_dict):
+        self.iter_count = state_dict['iter_count']
+        self.buffer_tokens = state_dict['buffer_tokens']
+        self.data = self.data.skip(self.iter_count)
+    # Yield {'input_ids': tensor} holding exactly seq_length tokens; leftovers carry over.
+    def get_sequence(self):
+        buffer_tokens = self.buffer_tokens
+        for x in self.data:
+            self.iter_count += 1  # counts documents, not yielded samples
+            curr_tokens = self.tokenizer(self.doc_separator + x['text'])['input_ids']
+            buffer_tokens += curr_tokens
+            while len(buffer_tokens) >= self.seq_length:
+                tokens = buffer_tokens[:self.seq_length]
+                buffer_tokens = buffer_tokens[self.seq_length:]
+                input_ids = torch.tensor(tokens)
+                self.buffer_tokens = buffer_tokens # update for restore
+                yield {
+                    'input_ids': input_ids,
+                }
+    # NOTE(review): itertools.cycle keeps a copy of every item it yields, so memory grows per pass.
+    def get_stream(self):
+        return cycle(self.get_sequence())
+    # The stream is created once and the same iterator object is returned on every call.
+    def __iter__(self):
+        if self.it is None:
+            self.it = self.get_stream()
+        return self.it
+        
+    
+def get_pile_train_data_loader(args, tokenizer, num_workers=0, state_dict=None):
+    """Build the training DataLoader over the streamed, shuffled Pile train split."""
+    shuffled = load_dataset('the_pile', split="train", streaming=True).shuffle(
+        buffer_size=10_000, seed=args.seed
+    )
+    dataset = StreamDataset(shuffled, tokenizer, args.seq_length)
+    # Restore previously saved stream progress, if any was provided.
+    if state_dict is not None:
+        dataset.load_state_dict(state_dict)
+
+    return torch.utils.data.DataLoader(
+        dataset,
+        batch_size=args.batch_size * args.data_group_size,
+        shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=None,
+    )
+
+def get_pile_ds(tokenizer, seq_length):
+    """Return a StreamDataset over the streamed (unshuffled) Pile train split."""
+    return StreamDataset(load_dataset("the_pile", split="train", streaming=True), tokenizer, seq_length)
\ No newline at end of file
diff --git a/opt_delta.py b/opt_delta.py
index 82371a4..481cc9a 100644
--- a/opt_delta.py
+++ b/opt_delta.py
@@ -9,6 +9,7 @@
 from quant import *
 from prettytable import PrettyTable
 import copy
+import os
 #from prettytable import PrettyTable
 
 def get_opt(model):
@@ -539,17 +540,16 @@ def main(args):
         dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
     )
     
-    ppl = opt_eval(model, testloader, DEV)
-    print(ppl)
+    # ppl = opt_eval(model, testloader, DEV)
+    # print(ppl)
 
     if args.rank > 0:
-        print("Number of params without low rank ", num_params)
-        print("Number of params with low rank", num_params - num_params_saved_lr)
+        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print("Number of params without low rank ", n_params)
+        print("Number of params with low rank", n_params - num_params_saved_lr)
     if args.save:
         opt_pack3(model, quantizers)
         torch.save(model.state_dict(), args.save) 
-        
-    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
     return ppl, n_params, comp_time
 
 if __name__ == '__main__':
@@ -610,6 +610,10 @@ def main(args):
         '--load', type=str, default='',
         help='Load quantized model.'
     )
+    parser.add_argument(
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
+    )
     parser.add_argument(
         '--benchmark', type=int, default=0,
         help='Number of tokens to use for benchmarking.'
@@ -650,15 +654,17 @@ def main(args):
     args = parser.parse_args()
 
     results = PrettyTable()
-    results.field_names = ['Bits', 'n_params', 'Time', 'wiki', 'ptb', 'c4']
-    for n_bits in [2, 3, 4]:
+    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
+    for n_bits in [2]:
         ppls = []
-        for dataset in ['wikitext2', 'ptb', 'c4']:
+        for dataset in ['wikitext2']:
             args.dataset = dataset
             args.wbits = n_bits
             args.save = 'opt-not-delta1.3b-%s-wbits%d.pt' % (dataset, n_bits)
             ppl, n_params, comp_time = main(args)
-            ppls.append(ppl)
-        results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
-    print(results)
+        #     ppls.append(ppl)
+        # results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
+        # print(results)
+        # with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+        #     f.write(str(results))
     print('finished.')
diff --git a/requirements.txt b/requirements.txt
index 7417000..b5c4f04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,9 @@
 transformers
 loguru
-datasets
\ No newline at end of file
+datasets
+safetensors==0.3.0
+datasets==2.10.1
+sentencepiece
+git+https://github.com/huggingface/transformers
+ninja
+prettytable
\ No newline at end of file
diff --git a/src/fmzip b/src/fmzip
new file mode 160000
index 0000000..b41e785
--- /dev/null
+++ b/src/fmzip
@@ -0,0 +1 @@
+Subproject commit b41e7856f092c80286577b2eb5e1294a764099d6

From 9d7744771d1c04710bf0fe51d69bd23fdcb6e537 Mon Sep 17 00:00:00 2001
From: Fluidstack User <fsuser@cloudvm.example.com>
Date: Fri, 12 May 2023 17:30:51 +0000
Subject: [PATCH 03/12] gptj

---
 gptj.py       | 308 +++++++++++++++++++++-----------------------------
 gptq.py       |  44 +++++---
 modelutils.py |  18 +--
 quant.py      | 114 ++-----------------
 4 files changed, 171 insertions(+), 313 deletions(-)

diff --git a/gptj.py b/gptj.py
index a4e34e7..a41f0ff 100644
--- a/gptj.py
+++ b/gptj.py
@@ -1,43 +1,35 @@
 import time
+import math
 
 import torch
 import torch.nn as nn
+import transformers
 
 from gptq import *
 from modelutils import *
-from quant import quantize, Quantizer, Quant3Linear, make_quant3
-from prettytable import PrettyTable
-from transformers import GPTJForCausalLM
+from quant import *
 
-def get_opt(model):
+def get_gptj(model):
     import torch
     def skip(*args, **kwargs):
         pass
     torch.nn.init.kaiming_uniform_ = skip
     torch.nn.init.uniform_ = skip
     torch.nn.init.normal_ = skip
-    model = GPTJForCausalLM.from_pretrained(
-    model,
-    revision="float16",
-    torch_dtype=torch.float16,
-    ).to("cuda")
-    model.seqlen = model.config.max_position_embeddings
+    from transformers import GPTJForCausalLM
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto')
+    model.seqlen = 2048
     return model
 
 @torch.no_grad()
-def opt_sequential(model, dataloader, dev):
+def gptj_sequential(model, dataloader, dev, means=None, stds=None):
     print('Starting ...')
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    layers = model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) 
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers = model.transformer.h
+
+    model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
 
     dtype = next(iter(model.parameters())).dtype
@@ -63,13 +55,10 @@ def forward(self, inp, **kwargs):
             pass
     layers[0] = layers[0].module
 
+    layers = model.transformer.h
     layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
     torch.cuda.empty_cache()
 
     outs = torch.zeros_like(inps)
@@ -87,9 +76,9 @@ def forward(self, inp, **kwargs):
             gptq[name] = GPTQ(subset[name])
             gptq[name].quantizer = Quantizer()
             gptq[name].quantizer.configure(
-                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+                args.wbits, perchannel=True, sym=False, mse=False
             )
-
+        
         def add_batch(name):
             def tmp(_, inp, out):
                 gptq[name].add_batch(inp[0].data, out.data)
@@ -105,25 +94,24 @@ def tmp(_, inp, out):
         for name in subset:
             print(i, name)
             print('Quantizing ...')
-            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
-            quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
-            gptq[name].free()
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
 
         layers[i] = layer.cpu()
         del layer
-        del gptq 
+        del gptq
         torch.cuda.empty_cache()
 
         inps, outs = outs, inps
 
     model.config.use_cache = use_cache
-    
+
     return quantizers
 
+
 @torch.no_grad()
-def opt_eval(model, testenc, dev):
+def gptj_eval(model, testenc, dev):
     print('Evaluating ...')
 
     testenc = testenc.input_ids
@@ -131,14 +119,9 @@ def opt_eval(model, testenc, dev):
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    layers = model.model.decoder.layers
-
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev) 
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(dev) 
+    layers = model.transformer.h
+
+    model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
 
     dtype = next(iter(model.parameters())).dtype
@@ -153,27 +136,24 @@ def __init__(self, module):
             self.module = module
         def forward(self, inp, **kwargs):
             inps[cache['i']] = inp
-            cache['i'] += 1
+            cache ['i'] += 1
             cache['attention_mask'] = kwargs['attention_mask']
             raise ValueError
     layers[0] = Catcher(layers[0])
     for i in range(nsamples):
-        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+        batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
         try:
             model(batch)
         except ValueError:
             pass
     layers[0] = layers[0].module
 
+    layers = model.transformer.h
     layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
     torch.cuda.empty_cache()
-
+    
     outs = torch.zeros_like(inps)
     attention_mask = cache['attention_mask']
 
@@ -182,11 +162,11 @@ def forward(self, inp, **kwargs):
         layer = layers[i].to(dev)
 
         if args.nearest:
-            subset = find_layers(layer)
+            subset  = find_layers(layer)
             for name in subset:
                 quantizer = Quantizer()
                 quantizer.configure(
-                    args.wbits, perchannel=True, sym=args.sym, mse=False
+                    args.wbits, perchannel=True, sym=False, mse=False
                 )
                 W = subset[name].weight.data
                 quantizer.find_params(W, weight=True)
@@ -201,20 +181,14 @@ def forward(self, inp, **kwargs):
         torch.cuda.empty_cache()
         inps, outs = outs, inps
 
-    if model.model.decoder.final_layer_norm is not None:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
-    if model.model.decoder.project_out is not None:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
+    model.transformer.ln_f = model.transformer.ln_f.to(dev)
     model.lm_head = model.lm_head.to(dev)
-
+    
     testenc = testenc.to(dev)
     nlls = []
     for i in range(nsamples):
         hidden_states = inps[i].unsqueeze(0)
-        if model.model.decoder.final_layer_norm is not None:
-            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
-        if model.model.decoder.project_out is not None:
-            hidden_states = model.model.decoder.project_out(hidden_states)
+        hidden_states = model.transformer.ln_f(hidden_states)
         lm_logits = model.lm_head(hidden_states)
         shift_logits = lm_logits[:, :-1, :].contiguous()
         shift_labels = testenc[
@@ -226,61 +200,60 @@ def forward(self, inp, **kwargs):
         nlls.append(neg_log_likelihood)
     ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
     print(ppl.item())
+    
 
     model.config.use_cache = use_cache
-    return ppl.item()
 
-# TODO: perform packing on GPU
-def opt_pack3(model, quantizers):
+def gptj_pack(model, quantizers, wbits, groupsize):
     layers = find_layers(model)
     layers = {n: layers[n] for n in quantizers}
-    make_quant3(model, quantizers, faster=args.faster_kernel)
-    qlayers = find_layers(model, [Quant3Linear])
+    make_quant(model, quantizers, wbits, groupsize)
+    qlayers = find_layers(model, [QuantLinear])
     print('Packing ...')
     for name in qlayers:
         print(name)
-        quantizers[name] = quantizers[name].cpu()
-        qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero)
-    print('Done.')
+        quantizers[name],scale,zero = quantizers[name]
+        quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu()
+        qlayers[name].pack(layers[name], scale, zero)
+    print('Done!')
     return model
 
-def load_quant3(model, checkpoint):
-    from transformers import OPTConfig, OPTForCausalLM 
-    config = OPTConfig.from_pretrained(model)
+def load_quant(model, checkpoint, wbits, groupsize):
+    from transformers import GPTJConfig, GPTJForCausalLM
+    config = GPTJConfig.from_pretrained(model)
     def noop(*args, **kwargs):
         pass
-    torch.nn.init.kaiming_uniform_ = noop 
-    torch.nn.init.uniform_ = noop 
-    torch.nn.init.normal_ = noop 
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
 
     torch.set_default_dtype(torch.half)
     transformers.modeling_utils._init_weights = False
     torch.set_default_dtype(torch.half)
-    model = OPTForCausalLM(config)
+    model = GPTJForCausalLM(config)
     torch.set_default_dtype(torch.float)
     model = model.eval()
     layers = find_layers(model)
-    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
+    for name in ['lm_head']:
         if name in layers:
             del layers[name]
-    make_quant3(model, layers, faster=args.faster_kernel)
+    make_quant(model, layers, wbits, groupsize)
 
     print('Loading model ...')
-    model.load_state_dict(torch.load(checkpoint))
-    model.seqlen = model.config.max_position_embeddings
-    print('Done.')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done!')
 
     return model
 
-def opt_multigpu(model, gpus):
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
-    if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
+def gptj_multigpu(model, gpus):
+    model.transformer.wte = model.transformer.wte.to(gpus[0])  # GPT-J token embedding
+    if hasattr(model.transformer, 'ln_f') and model.transformer.ln_f:
+        model.transformer.ln_f = model.transformer.ln_f.to(gpus[-1])  # same device as lm_head
     import copy
     model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
 
@@ -289,19 +262,19 @@ def opt_multigpu(model, gpus):
     class MoveModule(nn.Module):
         def __init__(self, module):
             super().__init__()
             self.module = module
             self.dev = next(iter(self.module.parameters())).device
         def forward(self, *inp, **kwargs):
             inp = list(inp)
             if inp[0].device != self.dev:
                 inp[0] = inp[0].to(self.dev)
-            if cache['mask'] is None or cache['mask'].device != self.dev:
+            if cache['mask'] is None or cache ['mask'].device != self.dev:
                 cache['mask'] = kwargs['attention_mask'].to(self.dev)
             kwargs['attention_mask'] = cache['mask']
             tmp = self.module(*inp, **kwargs)
             return tmp
 
-    layers = model.model.decoder.layers
+    layers = model.transformer.h  # GPT-J decoder blocks
     pergpu = math.ceil(len(layers) / len(gpus))
     for i in range(len(layers)):
         layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
@@ -318,7 +291,7 @@ def tmp(layer, inp, out):
             if cache['past']:
                 cache['past'][i] = None
         return tmp
-    for i, layer in enumerate(model.model.decoder.layers):
+    for i, layer in enumerate(model.transformer.h):  # GPT-J keeps its blocks in transformer.h
         layer.register_forward_hook(clear_past(i))
 
     print('Benchmarking ...')
@@ -333,72 +306,35 @@ def sync():
                 torch.cuda.synchronize(gpu)
         else:
             torch.cuda.synchronize()
+    max_memory = 0
     with torch.no_grad():
         attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
         times = []
         for i in range(input_ids.numel()):
             tick = time.time()
             out = model(
-                input_ids[:, i].reshape(-1),
+                input_ids[:, i:i+1],
                 past_key_values=cache['past'],
                 attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
             )
             sync()
             times.append(time.time() - tick)
             print(i, times[-1])
+            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)  # peak, in MiB
             if check and i != input_ids.numel() - 1:
                 tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
             cache['past'] = list(out.past_key_values)
             del out
         sync()
         import numpy as np
         print('Median:', np.median(times))
         if check:
             print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+            print('max memory(MiB):',max_memory)
 
 
-def main(args):
-    if args.load:
-        model = load_quant3(args.model, args.load)
-    else:
-        model = get_opt(args.model)
-        model.eval()
-
-    dataloader, testloader = get_loaders(
-        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
-    )
-
-    if args.wbits < 16 and not args.nearest:
-        tick = time.time()
-        quantizers = opt_sequential(model, dataloader, DEV)
-        print(time.time() - tick)
-
-    if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            opt_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
-        if args.benchmark:
-            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
-            benchmark(model, input_ids, check=args.check)
-    if args.load:
-        exit()
-    
-
-    dataloader, testloader = get_loaders(
-        args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
-    )
-    print(dataset)
-    ppl = opt_eval(model, testloader, DEV)
-    
-    if args.save:
-        opt_pack3(model, quantizers)
-        torch.save(model.state_dict(), args.save)
         
-    return ppl
 
-        
 if __name__ == '__main__':
     import argparse
     from datautils import *
@@ -406,11 +342,11 @@ def main(args):
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        '--model', type=str, default='lnair/opt-1.3b-wikitext2',
-        help='OPT model to load; pass `facebook/opt-X`.'
+        '--model', type=str, default='EleutherAI/gpt-j-6b',
+        help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
     )
     parser.add_argument(
-        '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
+        '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'],
         help='Where to extract calibration data from.'
     )
     parser.add_argument(
@@ -428,30 +364,26 @@ def main(args):
     parser.add_argument(
         '--nearest', action='store_true',
         help='Whether to run the RTN baseline.'
-    ) 
+    )
     parser.add_argument(
         '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
         help='#bits to use for quantization; use 16 for evaluating base model.'
     )
-    parser.add_argument(
-        '--trits', action='store_true',
-        help='Whether to use trits for quantization.'
-    )
     parser.add_argument(
         '--groupsize', type=int, default=-1,
         help='Groupsize to use for quantization; default uses full row.'
     )
     parser.add_argument(
-        '--sym', action='store_true',
-        help='Whether to perform symmetric quantization.'
+        '--save', type=str, default='',
+        help='Save the quantized GPT-J model under this name.'
     )
     parser.add_argument(
-        '--save', type=str, default='',
-        help='Save quantized checkpoint under this name.'
+        '--save_safetensors', type=str, default='',
+        help='Save the quantized GPT-J model as a  `.safetensors` ckpt'
     )
     parser.add_argument(
         '--load', type=str, default='',
-        help='Load quantized model.'
+        help='Load the quantized GPT-J model'
     )
     parser.add_argument(
         '--benchmark', type=int, default=0,
@@ -459,34 +391,56 @@ def main(args):
     )
     parser.add_argument(
         '--check', action='store_true',
-        help='Whether to compute perplexity during benchmarking for verification.'
-    )
-    parser.add_argument(
-        '--new-eval', action='store_true',
-        help='Whether to use the new PTB and C4 eval.'
-    )
-    parser.add_argument(
-        '--faster-kernel', action='store_true',
-        help='Whether to use the new faster kernel for benchmarking.'
-    )
-    parser.add_argument(
-        '--act-order', action='store_true',
-        help='Whether to apply the activation order GPTQ heuristic'
+        help='Whether to compute perpexity during benchmarking for verification.'
     )
 
+
     args = parser.parse_args()
-    
-    get_opt("EleutherAI/gpt-j-6B")
-    # results = PrettyTable()
-    # results.field_names = ['Bits', 'wiki', 'ptb', 'c4']
-    # for n_bits in [4, 3, 2]:
-    #     ppls = []
-    #     for dataset in ['wikitext2', 'ptb', 'c4']:
-    #         args.dataset = dataset
-    #         args.wbits = n_bits
-    #         args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits)
-    #         ppl = main(args)
-    #         ppls.append(ppl)
-    #     results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])
-    # print(results)
-    print('finished.')
+
+    if type(args.load) is not str:
+        args.load = args.load.as_posix()
+
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)
+    else:
+        model = get_gptj(args.model)
+        model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    if not args.load and args.wbits < 16 and not args.nearest:
+        tick = time.time()
+        quantizers = gptj_sequential(model, dataloader, DEV)
+        print(time.time() - tick)
+
+    if args.benchmark:
+        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+        if len(gpus) > 1:
+            gptj_multigpu(model, gpus)
+        else:
+            model = model.to(DEV)
+        if args.benchmark:
+            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+            benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+
+    for dataset in ['wikitext2', 'ptb', 'c4']:
+        dataloader, testloader = get_loaders(
+            dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+        )
+        print(dataset)
+        gptj_eval(model, testloader, DEV)
+
+
+    if args.save or args.save_safetensors:
+        # Pack exactly once: gptj_pack rewrites `quantizers` and swaps the layers in place.
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
+    if args.save:
+        torch.save(model.state_dict(), args.save)
+    if args.save_safetensors:
+        from safetensors.torch import save_file as safe_save
+        safe_save(model.state_dict(), args.save_safetensors)
\ No newline at end of file
diff --git a/gptq.py b/gptq.py
index 2477cac..b4546cc 100644
--- a/gptq.py
+++ b/gptq.py
@@ -1,16 +1,19 @@
 import math
 import time
+
 import torch
-import transformers
 import torch.nn as nn
+import transformers
+
+from quant import *
 
-from quant import quantize
 
 DEBUG = False 
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
+
 class GPTQ:
     def __init__(self, layer):
         self.layer = layer
@@ -54,7 +57,7 @@ def add_batch(self, inp, out):
         self.H += inp.matmul(inp.t())
 
     def fasterquant(
-        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False
+        self, blocksize=128, percdamp=.01, groupsize=-1
     ):
         W = self.layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -74,11 +77,6 @@ def fasterquant(
         H[dead, dead] = 1
         W[:, dead] = 0
 
-        if actorder:
-            perm = torch.argsort(torch.diag(H), descending=True)
-            W = W[:, perm]
-            H = H[perm][:, perm]
-
         Losses = torch.zeros_like(W)
         Q = torch.zeros_like(W)
 
@@ -89,6 +87,10 @@ def fasterquant(
         H = torch.cholesky_inverse(H)
         H = torch.linalg.cholesky(H, upper=True)
         Hinv = H
+        
+        scale = []
+        zero = []
+        now_idx = 1
 
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)
@@ -107,6 +109,11 @@ def fasterquant(
                 if groupsize != -1:
                     if (i1 + i) % groupsize == 0:
                         self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)
+                    
+                    if ((i1 + i) // groupsize) - now_idx == -1:
+                        scale.append(self.quantizer.scale)
+                        zero.append(self.quantizer.zero)
+                        now_idx += 1
 
                 q = quantize(
                     w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
@@ -130,21 +137,22 @@ def fasterquant(
                 print(torch.sum(Losses))
 
         torch.cuda.synchronize()
-        total_time = time.time() - tick
-        # print('time %.2f' % total_time)
-        error = torch.sum(Losses).item()
-        # print('error', error)
-
-        if actorder:
-            invperm = torch.argsort(perm)
-            Q = Q[:, invperm]
+        print('time %.2f' % (time.time() - tick))
+        print('error', torch.sum(Losses).item())
 
         if isinstance(self.layer, transformers.Conv1D):
             Q = Q.t()
         self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
         if DEBUG:
             print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
-
+            
+        if scale == []:
+            scale.append(self.quantizer.scale)
+            zero.append(self.quantizer.zero)
+        scale = torch.cat(scale,dim=1)
+        zero = torch.cat(zero,dim=1)
+        return scale,zero
+            
     def free(self):
         if DEBUG:
             self.inp1 = None
@@ -152,4 +160,4 @@ def free(self):
         self.H = None
         self.Losses = None
         self.Trace = None
-        torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
\ No newline at end of file
diff --git a/modelutils.py b/modelutils.py
index c93410d..5b36877 100644
--- a/modelutils.py
+++ b/modelutils.py
@@ -1,8 +1,10 @@
 import torch
 import torch.nn as nn
-from transformers import OPTForCausalLM
+
+
 DEV = torch.device('cuda:0')
 
+
 def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
     if type(module) in layers:
         return {name: module}
@@ -11,16 +13,4 @@ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
         res.update(find_layers(
             child, layers=layers, name=name + '.' + name1 if name != '' else name1
         ))
-    return res
-
-def get_opt(model):
-    def skip(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = skip
-    torch.nn.init.uniform_ = skip
-    torch.nn.init.normal_ = skip
-    
-    # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')
-    model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
-    model.seqlen = model.config.max_position_embeddings
-    return model
\ No newline at end of file
+    return res
\ No newline at end of file
diff --git a/quant.py b/quant.py
index f8cc1b7..fe58148 100644
--- a/quant.py
+++ b/quant.py
@@ -1,11 +1,9 @@
-import math
 import numpy as np
 import torch
 import torch.nn as nn
+import math
 
 def quantize(x, scale, zero, maxq):
-    if maxq < 0:
-        return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
     q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
     return scale * (q - zero)
 
@@ -18,11 +16,10 @@ def __init__(self, shape=1):
         self.register_buffer('zero', torch.zeros(shape))
 
     def configure(
-        self,
-        bits, perchannel=False, sym=True, 
-        mse=False, norm=2.4, grid=100, maxshrink=.8,
-        trits=False
-    ):
+            self,
+            bits, perchannel=False, sym=True, 
+            mse=False, norm=2.4, grid=100, maxshrink=.8
+        ):
         self.maxq = torch.tensor(2 ** bits - 1)
         self.perchannel = perchannel
         self.sym = sym
@@ -30,8 +27,6 @@ def configure(
         self.norm = norm
         self.grid = grid
         self.maxshrink = maxshrink 
-        if trits:
-            self.maxq = torch.tensor(-1) 
 
     def find_params(self, x, weight=False):
         dev = x.device
@@ -65,15 +60,11 @@ def find_params(self, x, weight=False):
         xmin[tmp] = -1
         xmax[tmp] = +1
 
-        if self.maxq < 0:
-          self.scale = xmax
-          self.zero = xmin
+        self.scale = (xmax - xmin) / self.maxq
+        if self.sym:
+            self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
         else:
-          self.scale = (xmax - xmin) / self.maxq
-          if self.sym:
-              self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
-          else:
-              self.zero = torch.round(-xmin / self.scale)
+            self.zero = torch.round(-xmin / self.scale)
 
         if self.mse:
             best = torch.full([x.shape[0]], float('inf'), device=dev)
@@ -133,91 +124,6 @@ def ready(self):
 except:
     print('CUDA extension not installed.')
 
-# Assumes layer is perfectly divisible into 1024 * 1024 blocks
-class Quant3Linear(nn.Module): 
-
-    def __init__(self, infeatures, outfeatures, faster=False):
-        super().__init__()
-        self.register_buffer('zeros', torch.zeros((outfeatures, 1)))
-        self.register_buffer('scales', torch.zeros((outfeatures, 1)))
-        self.register_buffer('bias', torch.zeros(outfeatures))
-        self.register_buffer(
-            'qweight', torch.zeros((infeatures // 32 * 3, outfeatures), dtype=torch.int)
-        )
-        self.faster = faster
-
-    def pack(self, linear, scales, zeros):
-        self.zeros = zeros * scales
-        self.scales = scales.clone()
-        self.bias = linear.bias.clone()
-
-        intweight = torch.round((linear.weight.data + self.zeros) / self.scales).to(torch.int)
-        intweight = intweight.t().contiguous()
-        intweight = intweight.numpy().astype(np.uint32)
-        qweight = np.zeros(
-            (intweight.shape[0] // 32 * 3, intweight.shape[1]), dtype=np.uint32
-        )
-        i = 0
-        row = 0
-        while row < qweight.shape[0]:
-            for j in range(i, i + 10):
-                qweight[row] |= intweight[j] << (3 * (j - i))
-            i += 10
-            qweight[row] |= intweight[i] << 30
-            row += 1
-            qweight[row] |= (intweight[i] >> 2) & 1
-            i += 1
-            for j in range(i, i + 10):
-                qweight[row] |= intweight[j] << (3 * (j - i) + 1)
-            i += 10
-            qweight[row] |= intweight[i] << 31
-            row += 1
-            qweight[row] |= (intweight[i] >> 1) & 0x3
-            i += 1
-            for j in range(i, i + 10):
-                qweight[row] |= intweight[j] << (3 * (j - i) + 2)
-            i += 10
-            row += 1
-
-        qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight) 
-
-    def forward(self, x):
-        if x.shape[-1] == x.numel():
-            outshape = list(x.shape)
-            y = self.bias.clone()
-            outshape[-1] = self.bias.numel()
-            dtype = x.dtype
-            if self.faster:
-                x = x.half()
-                quant_cuda.vecquant3matmul_faster(x, self.qweight, y, self.scales, self.zeros)
-            else:
-                x = x.float()
-                quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.zeros)
-            y = y.to(dtype)
-            return y.reshape(outshape)
-        raise ValueError('Only supports a single token currently.')
-
-def make_quant3(module, names, name='', faster=False):
-    if isinstance(module, Quant3Linear):
-        return
-    for attr in dir(module):
-        tmp = getattr(module, attr)
-        name1 = name + '.' + attr if name != '' else attr
-        if name1 in names:
-            setattr(
-                module, attr, Quant3Linear(tmp.in_features, tmp.out_features, faster=faster)
-            )
-    for name1, child in module.named_children():
-        make_quant3(child, names, name + '.' + name1 if name != '' else name1, faster=faster)
-
-def make_quant_lr(module, r_names, l_names, name='', faster=False):
-    if isinstance(module, Quant3Linear):
-        return
-    for attr in dir(module):
-        tmp = getattr(module, attr)
-        name1 = name + '.' + attr if name != '' else attr
-
 # Assumes layer is perfectly divisible into 256 * 256 blocks
 class QuantLinear(nn.Module): 
     def __init__(self, bits, groupsize, infeatures, outfeatures):
@@ -356,4 +262,4 @@ def make_quant(module, names, bits, groupsize, name=''):
                 module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features)
             )
     for name1, child in module.named_children():
-        make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
+        make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
\ No newline at end of file

From c57d9c38cb566c6d61480806abd094fa8c076933 Mon Sep 17 00:00:00 2001
From: Fluidstack User <fsuser@cloudvm.example.com>
Date: Sat, 13 May 2023 00:16:10 +0000
Subject: [PATCH 04/12] WIP GPTJ

---
 datautils.py     | 50 +++++++++++++++++++++++++++++-------------------
 gptj.py          | 31 ++++++++++++++++++++++--------
 requirements.txt | 10 +++-------
 3 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/datautils.py b/datautils.py
index 045121a..2616fc0 100644
--- a/datautils.py
+++ b/datautils.py
@@ -6,6 +6,7 @@ def set_seed(seed):
     torch.random.manual_seed(seed)
 
 def get_wikitext2(nsamples, seed, seqlen, model):
+    seqlen = 2048
     from datasets import load_dataset
     traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
     testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
@@ -28,6 +29,7 @@ def get_wikitext2(nsamples, seed, seqlen, model):
     return trainloader, testenc
 
 def get_ptb(nsamples, seed, seqlen, model):
+    seqlen = 2048
     from datasets import load_dataset
     traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
     valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
@@ -50,6 +52,8 @@ def get_ptb(nsamples, seed, seqlen, model):
     return trainloader, testenc
 
 def get_c4(nsamples, seed, seqlen, model):
+    print("loading the c4 dataset")
+    seqlen = 2048
     from datasets import load_dataset
     traindata = load_dataset(
         'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
@@ -65,14 +69,18 @@ def get_c4(nsamples, seed, seqlen, model):
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        while True:
-            i = random.randint(0, len(traindata) - 1)
-            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
-            if trainenc.input_ids.shape[1] >= seqlen:
-                break
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
+        i = random.randint(0, len(traindata) - 1)
+        trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
+        # while True:
+        #     i = random.randint(0, len(traindata) - 1)
+        #     trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
+        #     print(trainenc.input_ids.shape)
+        #     if trainenc.input_ids.shape[1] > seqlen - 1:
+        #         break
+        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        # j = i + seqlen
+        # inp = trainenc.input_ids[:, i:j]
+        inp = trainenc.input_ids
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
@@ -80,15 +88,15 @@ def get_c4(nsamples, seed, seqlen, model):
     import random
     random.seed(0)
     valenc = []
-    for _ in range(256):
-        while True:
-            i = random.randint(0, len(valdata) - 1)
-            tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
-            if tmp.input_ids.shape[1] >= seqlen:
-                break
-        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        valenc.append(tmp.input_ids[:, i:j])
+    # for _ in range(256):
+    #     while True:
+    i = random.randint(0, len(valdata) - 1)
+    tmp = tokenizer(valdata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
+    #         if tmp.input_ids.shape[1] >= seqlen:
+    #             break
+    #     i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
+    #     j = i + seqlen
+    valenc.append(tmp.input_ids)
     valenc = torch.hstack(valenc)
     class TokenizerWrapper:
         def __init__(self, input_ids):
@@ -120,6 +128,7 @@ def get_ptb_new(nsamples, seed, seqlen, model):
     return trainloader, testenc
 
 def get_c4_new(nsamples, seed, seqlen, model):
+    print("loading the c4 new dataset")
     from datasets import load_dataset
     traindata = load_dataset(
         'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
@@ -130,14 +139,14 @@ def get_c4_new(nsamples, seed, seqlen, model):
 
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-
+    seqlen = 2048
     import random
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
         while True:
             i = random.randint(0, len(traindata) - 1)
-            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+            trainenc = tokenizer(traindata[i]['text'], max_length = 256, truncation=True, return_tensors='pt')
             if trainenc.input_ids.shape[1] >= seqlen:
                 break
         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
@@ -147,7 +156,7 @@ def get_c4_new(nsamples, seed, seqlen, model):
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
 
-    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
+    valenc = tokenizer(' '.join(valdata[:1100]['text']), max_length = 256, truncation=True, return_tensors='pt')
     valenc = valenc.input_ids[:, :(256 * seqlen)]
 
     class TokenizerWrapper:
@@ -161,6 +170,7 @@ def __init__(self, input_ids):
 def get_loaders(
     name, nsamples=128, seed=0, seqlen=2048, model=''
 ):
+    print("loading from dataset ", name)
     if 'wikitext2' in name:
         return get_wikitext2(nsamples, seed, seqlen, model)
     if 'ptb' in name:
diff --git a/gptj.py b/gptj.py
index a41f0ff..cd60177 100644
--- a/gptj.py
+++ b/gptj.py
@@ -27,8 +27,10 @@ def gptj_sequential(model, dataloader, dev, means=None, stds=None):
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
+    print(model.transformer.h)
     layers = model.transformer.h
-
+    print(layers)
+    
     model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
 
@@ -50,6 +52,7 @@ def forward(self, inp, **kwargs):
     layers[0] = Catcher(layers[0])
     for batch in dataloader:
         try:
+            print(batch[0].shape)
             model(batch[0].to(dev))
         except ValueError:
             pass
@@ -119,8 +122,10 @@ def gptj_eval(model, testenc, dev):
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
+    print(model.transformer.h)
     layers = model.transformer.h
-
+    print(layers)
+    
     model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
 
@@ -143,7 +148,13 @@ def forward(self, inp, **kwargs):
     for i in range(nsamples):
         batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
         try:
-            model(batch)
+            print(batch.shape)
+            # question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+            # tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False)
+            # inputs = tokenizer(question, text, return_tensors="pt")
+            # print(inputs.shape)
+            # outputs = model(**inputs)
+            model(**batch)
         except ValueError:
             pass
     layers[0] = layers[0].module
@@ -312,6 +323,7 @@ def sync():
         times = []
         for i in range(input_ids.numel()):
             tick = time.time()
+        
             out = model(
                 input_ids[:, i:i+1],
                 past_key_values=cache['past'],
@@ -346,7 +358,7 @@ def sync():
         help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
     )
     parser.add_argument(
-        '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'],
+        '--dataset', type=str, default='c4', choices=['wikitext2', 'ptb', 'c4'],
         help='Where to extract calibration data from.'
     )
     parser.add_argument(
@@ -394,7 +406,7 @@ def sync():
         help='Whether to compute perpexity during benchmarking for verification.'
     )
 
-
+    print("just confirming that I am actually running this stuff")
     args = parser.parse_args()
 
     if type(args.load) is not str:
@@ -403,13 +415,16 @@ def sync():
     if args.load:
         model = load_quant(args.model, args.load, args.wbits, args.groupsize)
     else:
+        print("getting the model")
         model = get_gptj(args.model)
         model.eval()
-
+        print("Done getting the model")
+        
+    print("Getting data loaders")
     dataloader, testloader = get_loaders(
         args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
     )
-
+    print("finished getting data loaders")
     if not args.load and args.wbits < 16 and not args.nearest:
         tick = time.time()
         quantizers = gptj_sequential(model, dataloader, DEV)
@@ -428,7 +443,7 @@ def sync():
         exit()
 
 
-    for dataset in ['wikitext2', 'ptb', 'c4']:
+    for dataset in ['c4']:
         dataloader, testloader = get_loaders(
             dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
         )
diff --git a/requirements.txt b/requirements.txt
index b5c4f04..321525d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,5 @@
-transformers
-loguru
-datasets
 safetensors==0.3.0
-datasets==2.10.1
+datasets==1.17.0
 sentencepiece
-git+https://github.com/huggingface/transformers
-ninja
-prettytable
\ No newline at end of file
+transformers==4.21.2
+ninja
\ No newline at end of file

From 671fdab60361336ade958501a4e7aa4e5414fd2b Mon Sep 17 00:00:00 2001
From: ningwanyi <sh8400947@qq.com>
Date: Sat, 13 May 2023 03:18:38 +0000
Subject: [PATCH 05/12] gptj working

---
 gptj.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 6 deletions(-)

diff --git a/gptj.py b/gptj.py
index cd60177..7f5b7dd 100644
--- a/gptj.py
+++ b/gptj.py
@@ -8,7 +8,7 @@
 from gptq import *
 from modelutils import *
 from quant import *
-
+from prettytable import PrettyTable
 def get_gptj(model):
     import torch
     def skip(*args, **kwargs):
@@ -17,8 +17,10 @@ def skip(*args, **kwargs):
     torch.nn.init.uniform_ = skip
     torch.nn.init.normal_ = skip
     from transformers import GPTJForCausalLM
+    print(model)
     model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto')
-    model.seqlen = 2048
+    model.seqlen = model.config.max_position_embeddings
+    print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
     return model
 
 @torch.no_grad()
@@ -154,7 +156,7 @@ def forward(self, inp, **kwargs):
             # inputs = tokenizer(question, text, return_tensors="pt")
             # print(inputs.shape)
             # outputs = model(**inputs)
-            model(**batch)
+            model(batch)
         except ValueError:
             pass
     layers[0] = layers[0].module
@@ -346,6 +348,46 @@ def sync():
 
 
         
+def main(args):
+    if args.load:
+        model = load_quant3(args.model, args.load)
+    else:
+        model = get_gptj(args.model)
+        model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    if args.wbits < 16 and not args.nearest:
+        tick = time.time()
+        quantizers = gptj_sequential(model, dataloader, DEV)
+        print(time.time() - tick)
+
+    if args.benchmark:
+        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+        if len(gpus) > 1:
+            gptj_multigpu(model, gpus)
+        else:
+            model = model.to(DEV)
+        if args.benchmark:
+            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+            benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+    
+
+    dataloader, testloader = get_loaders(
+        args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    print(dataset)
+    ppl = gptj_eval(model, testloader, DEV)
+    
+    if args.save:
+        gptj_pack3(model, quantizers)
+        torch.save(model.state_dict(), args.save)
+        
+    return ppl
 
 if __name__ == '__main__':
     import argparse
@@ -358,7 +400,7 @@ def sync():
         help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
     )
     parser.add_argument(
-        '--dataset', type=str, default='c4', choices=['wikitext2', 'ptb', 'c4'],
+        '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'],
         help='Where to extract calibration data from.'
     )
     parser.add_argument(
@@ -443,7 +485,7 @@ def sync():
         exit()
 
 
-    for dataset in ['c4']:
+    for dataset in ['wikitext2', 'ptb', 'c4']:
         dataloader, testloader = get_loaders(
             dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
         )
@@ -458,4 +500,18 @@ def sync():
     if args.save_safetensors:
         gptj_pack(model, quantizers, args.wbits, args.groupsize)
         from safetensors.torch import save_file as safe_save
-        safe_save(model.state_dict(), args.save_safetensors)
\ No newline at end of file
+        safe_save(model.state_dict(), args.save_safetensors)
+        
+    results = PrettyTable()
+    results.field_names = ['Bits', 'wiki', 'ptb', 'c4']
+    for n_bits in [4, 3, 2]:
+        ppls = []
+        for dataset in ['wikitext2', 'ptb', 'c4']:
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])
+        print(results)
+    print('finished.')
\ No newline at end of file

From 72edd428af64a08cf2b3f50d6e0f8dd92d4864df Mon Sep 17 00:00:00 2001
From: kumbong <kumbonghermann@gmail.com>
Date: Sat, 13 May 2023 08:05:45 +0000
Subject: [PATCH 06/12] push gptj eval

---
 datautils.py  |  51 ++---
 evaluation.sh |  76 +++---
 gptj.py       | 178 ++++++++------
 gptj_delta.py | 624 ++++++++++++++++++++++++++++++++++++++++++++++++++
 gptq.py       |  42 ++--
 quant.py      | 112 ++++++++-
 6 files changed, 916 insertions(+), 167 deletions(-)
 create mode 100644 gptj_delta.py

diff --git a/datautils.py b/datautils.py
index 2616fc0..08bfb66 100644
--- a/datautils.py
+++ b/datautils.py
@@ -6,46 +6,45 @@ def set_seed(seed):
     torch.random.manual_seed(seed)
 
 def get_wikitext2(nsamples, seed, seqlen, model):
-    seqlen = 2048
     from datasets import load_dataset
     traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
     testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
 
     from transformers import AutoTokenizer 
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
-    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
+    trainenc = tokenizer("\n\n".join(traindata['text']), max_length = seqlen, truncation=True, return_tensors='pt')
+    testenc = tokenizer("\n\n".join(testdata['text']), max_length = seqlen, truncation=True, return_tensors='pt')
 
     import random
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
+        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        # j = i + seqlen
+        inp = trainenc.input_ids#[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
     return trainloader, testenc
 
 def get_ptb(nsamples, seed, seqlen, model):
-    seqlen = 2048
+    seqlen = seqlen
     from datasets import load_dataset
     traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
     valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
 
     from transformers import AutoTokenizer 
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt')
-    testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')
+    trainenc = tokenizer("\n\n".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
+    testenc = tokenizer("\n\n".join(valdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
 
     import random
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
+        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        # j = i + seqlen
+        inp = trainenc.input_ids#[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
@@ -79,7 +78,7 @@ def get_c4(nsamples, seed, seqlen, model):
         #         break
         # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
         # j = i + seqlen
-        # inp = trainenc.input_ids[:, i:j]
+        inp = trainenc.input_ids#[:, i:j]
         inp = trainenc.input_ids
         tar = inp.clone()
         tar[:, :-1] = -100
@@ -112,16 +111,16 @@ def get_ptb_new(nsamples, seed, seqlen, model):
 
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
-    testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')
+    trainenc = tokenizer(" ".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
+    testenc = tokenizer(" ".join(testdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
 
     import random
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
+        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        # j = i + seqlen
+        inp = trainenc.input_ids#[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
@@ -144,14 +143,14 @@ def get_c4_new(nsamples, seed, seqlen, model):
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        while True:
-            i = random.randint(0, len(traindata) - 1)
-            trainenc = tokenizer(traindata[i]['text'], max_length = 256, truncation=True, return_tensors='pt')
-            if trainenc.input_ids.shape[1] >= seqlen:
-                break
-        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        j = i + seqlen
-        inp = trainenc.input_ids[:, i:j]
+        # while True:
+        #     i = random.randint(0, len(traindata) - 1)
+        trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
+            # if trainenc.input_ids.shape[1] >= seqlen:
+            #     break
+        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        # j = i + seqlen
+        inp = trainenc.input_ids#[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
diff --git a/evaluation.sh b/evaluation.sh
index aaea40f..2714383 100755
--- a/evaluation.sh
+++ b/evaluation.sh
@@ -1,52 +1,50 @@
-CUDA_VISIBLE_DEVICES=0 python3 opt_delta.py \
+CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \
     --groupsize 1024 \
     --delta \
-    --benchmark_results "file_0.txt" \
+    --benchmark_results "delta.txt" \
 &
-CUDA_VISIBLE_DEVICES=1 python3 opt_delta.py \
+CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \
     --groupsize 1024 \
     --sparsify_hard_threshold \
     --fraction_of_zero 0.9 \
     --delta \
-    --benchmark_results "file_1.txt" \
+    --benchmark_results "delta_sparse_0.9.txt" 
 &
-CUDA_VISIBLE_DEVICES=2 python3 opt_delta.py \
+CUDA_VISIBLE_DEVICES=2 python3 gptj.py \
     --groupsize 1024 \
     --sparsify_hard_threshold \
     --fraction_of_zero 0.99 \
     --delta \
-    --benchmark_results "file_2.txt" \
+    --benchmark_results "delta_sparse_0.99.txt" \
 &
-CUDA_VISIBLE_DEVICES=3 python3 opt_delta.py \
-    --groupsize 1024 \
-    --delta \
-    --rank 32 \
-    --benchmark_results "file_3.txt" \
-&
-CUDA_VISIBLE_DEVICES=4 python3 opt_delta.py \
-    --groupsize 1024 \
-    --delta \
-    --rank 16  \
-    --benchmark_results "file_4.txt" \
-&
-CUDA_VISIBLE_DEVICES=5 python3 opt_delta.py \
-    --groupsize 1024 \
-    --delta \
-    --rank 64 \
-    --benchmark_results "file_5.txt" \
-&
-CUDA_VISIBLE_DEVICES=6 python3 opt_delta.py \
-    --groupsize 1024 \
-    --delta \
-    --rank 32 \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.9 \
-    --benchmark_results "file_6.txt" \
-&
-CUDA_VISIBLE_DEVICES=7 python3 opt_delta.py \
-    --groupsize 1024 \
-    --delta \
-    --rank 32 \
-    --sparsify_hard_threshold \
-    --fraction_of_zero 0.99 \
-    --benchmark_results "file_7.txt" 
\ No newline at end of file
+CUDA_VISIBLE_DEVICES=3 python3 gptj.py \
+    --groupsize 1024 \
+    --benchmark_results "base.txt" 
+# &
+# CUDA_VISIBLE_DEVICES=4 python3 gptj_delta.py \
+#     --groupsize 1024 \
+#     --delta \
+#     --rank 16  \
+#     --benchmark_results "file_4.txt" \
+#&
+# CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \
+#     --groupsize 1024 \
+#     --delta \
+#     --rank 64 \
+#     --benchmark_results "file_5.txt" \
+# &
+# CUDA_VISIBLE_DEVICES=6 python3 gptj_delta.py \
+#     --groupsize 1024 \
+#     --delta \
+#     --rank 32 \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.9 \
+#     --benchmark_results "file_6.txt" \
+# &
+# CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \
+#     --groupsize 1024 \
+#     --delta \
+#     --rank 32 \
+#     --sparsify_hard_threshold \
+#     --fraction_of_zero 0.99 \
+#     --benchmark_results "file_7.txt" 
\ No newline at end of file
diff --git a/gptj.py b/gptj.py
index 7f5b7dd..4517509 100644
--- a/gptj.py
+++ b/gptj.py
@@ -9,6 +9,8 @@
 from modelutils import *
 from quant import *
 from prettytable import PrettyTable
+import os
+
 def get_gptj(model):
     import torch
     def skip(*args, **kwargs):
@@ -17,7 +19,7 @@ def skip(*args, **kwargs):
     torch.nn.init.uniform_ = skip
     torch.nn.init.normal_ = skip
     from transformers import GPTJForCausalLM
-    print(model)
+    # print(model)
     model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto')
     model.seqlen = model.config.max_position_embeddings
     print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
@@ -29,9 +31,9 @@ def gptj_sequential(model, dataloader, dev, means=None, stds=None):
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    print(model.transformer.h)
+    #print(model.transformer.h)
     layers = model.transformer.h
-    print(layers)
+    #print(layers)
     
     model.transformer.wte = model.transformer.wte.to(dev)
     layers[0] = layers[0].to(dev)
@@ -54,7 +56,6 @@ def forward(self, inp, **kwargs):
     layers[0] = Catcher(layers[0])
     for batch in dataloader:
         try:
-            print(batch[0].shape)
             model(batch[0].to(dev))
         except ValueError:
             pass
@@ -114,6 +115,99 @@ def tmp(_, inp, out):
 
     return quantizers
 
+@torch.no_grad()
+def gptj_sequential_delta(model, delta_model, dataloader, dev):  # GPTQ-quantize delta_model block by block; inputs to each block come from `model`; returns {module name: Quantizer}; reads global `args`
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.transformer.h
+    delta_layers = delta_model.transformer.h
+
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):  # stands in for block 0 to record its inputs
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError  # abort the forward pass once the input is captured
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:  # raised on purpose by Catcher
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(delta_layers)):
+        layer = delta_layers[i].to(dev)
+        original_layer = layers[i].to(dev)
+
+        subset = find_layers(layer)
+        gptq = {}
+        for name in subset:
+            gptq[name] = GPTQ(subset[name])
+            gptq[name].quantizer = Quantizer()
+            gptq[name].quantizer.configure(
+                args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits
+            )
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                gptq[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):  # calibration pass: the hooks feed each GPTQ instance
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(i, name)
+            print('Quantizing ...')
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+            quantizers['transformer.h.%d.%s' % (i, name)] = gptq[name].quantizer  # GPT-J module path, so keys match find_layers(model)
+            gptq[name].free()
+
+        for j in range(args.nsamples):  # second pass, now with the quantized weights
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq
+        torch.cuda.empty_cache()
+
+        inps, outs, original_outs = original_outs, inps, outs  # rotate all three buffers so the next inputs never alias original_outs
+
+    model.config.use_cache = use_cache
+
+    return quantizers
 
 @torch.no_grad()
 def gptj_eval(model, testenc, dev):
@@ -124,7 +218,7 @@ def gptj_eval(model, testenc, dev):
 
     use_cache = model.config.use_cache
     model.config.use_cache = False
-    print(model.transformer.h)
+    # print(model.transformer.h)
     layers = model.transformer.h
     print(layers)
     
@@ -150,12 +244,6 @@ def forward(self, inp, **kwargs):
     for i in range(nsamples):
         batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
         try:
-            print(batch.shape)
-            # question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-            # tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False)
-            # inputs = tokenizer(question, text, return_tensors="pt")
-            # print(inputs.shape)
-            # outputs = model(**inputs)
             model(batch)
         except ValueError:
             pass
@@ -384,7 +472,7 @@ def main(args):
     ppl = gptj_eval(model, testloader, DEV)
     
     if args.save:
-        gptj_pack3(model, quantizers)
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
         torch.save(model.state_dict(), args.save)
         
     return ppl
@@ -447,63 +535,15 @@ def main(args):
         '--check', action='store_true',
         help='Whether to compute perpexity during benchmarking for verification.'
     )
-
-    print("just confirming that I am actually running this stuff")
-    args = parser.parse_args()
-
-    if type(args.load) is not str:
-        args.load = args.load.as_posix()
-
-    if args.load:
-        model = load_quant(args.model, args.load, args.wbits, args.groupsize)
-    else:
-        print("getting the model")
-        model = get_gptj(args.model)
-        model.eval()
-        print("Done getting the model")
-        
-    print("Getting data loaders")
-    dataloader, testloader = get_loaders(
-        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    parser.add_argument(
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
     )
-    print("finished getting data loaders")
-    if not args.load and args.wbits < 16 and not args.nearest:
-        tick = time.time()
-        quantizers = gptj_sequential(model, dataloader, DEV)
-        print(time.time() - tick)
-
-    if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            gptj_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
-        if args.benchmark:
-            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
-            benchmark(model, input_ids, check=args.check)
-    if args.load:
-        exit()
-
-
-    for dataset in ['wikitext2', 'ptb', 'c4']:
-        dataloader, testloader = get_loaders(
-            dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
-        )
-        print(dataset)
-        gptj_eval(model, testloader, DEV)
-
-
-    if args.save:
-        gptj_pack(model, quantizers, args.wbits, args.groupsize)
-        torch.save(model.state_dict(), args.save)
-
-    if args.save_safetensors:
-        gptj_pack(model, quantizers, args.wbits, args.groupsize)
-        from safetensors.torch import save_file as safe_save
-        safe_save(model.state_dict(), args.save_safetensors)
+    
+    args = parser.parse_args()
         
     results = PrettyTable()
     results.field_names = ['Bits', 'wiki', 'ptb', 'c4']
     for n_bits in [4, 3, 2]:
         ppls = []
         for dataset in ['wikitext2', 'ptb', 'c4']:
@@ -512,6 +552,10 @@ def main(args):
             args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits)
             ppl = main(args)
             ppls.append(ppl)
         results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])
         print(results)
-    print('finished.')
\ No newline at end of file
+        # --benchmark_results defaults to ''; only write the table when a file name was given.
+        if args.benchmark_results:
+            with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+                f.write(str(results))
+    print('finished.')
diff --git a/gptj_delta.py b/gptj_delta.py
new file mode 100644
index 0000000..2ea5dfc
--- /dev/null
+++ b/gptj_delta.py
@@ -0,0 +1,624 @@
+import time
+import math
+
+import torch
+import torch.nn as nn
+import transformers
+
+from gptq import *
+from modelutils import *
+from quant import *
+from prettytable import PrettyTable
+import os
+import copy
+
+def get_gptj(model):  # Load the GPT-J checkpoint named by `model` as an fp16 GPTJForCausalLM and attach a `seqlen` attribute.
+    import torch
+    def skip(*args, **kwargs):  # no-op that replaces torch's weight initializers below
+        pass
+    torch.nn.init.kaiming_uniform_ = skip  # NOTE: these three assignments patch torch.nn.init for the whole process
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import GPTJForCausalLM
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    model.seqlen = model.config.max_position_embeddings  # read as the sequence length by the calibration/eval code in this file
+    print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad))
+    return model
+
+@torch.no_grad()
+def gptj_sequential(model, dataloader, dev, means=None, stds=None):  # GPTQ-quantize every transformer block of `model` in place, one block at a time on `dev`; `means`/`stds` are unused; reads global `args`
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    #print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+    
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):  # temporary stand-in for block 0 that records what the model feeds it
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError  # stop the forward pass as soon as the input is stored
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:  # the exception raised by Catcher.forward
+            pass
+    layers[0] = layers[0].module
+
+    layers = model.transformer.h
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(layers)):
+        layer = layers[i].to(dev)
+
+        subset = find_layers(layer)
+        gptq = {}
+        for name in subset:
+            gptq[name] = GPTQ(subset[name])
+            gptq[name].quantizer = Quantizer()
+            gptq[name].quantizer.configure(
+                args.wbits, perchannel=True, sym=False, mse=False
+            )
+        
+        def add_batch(name):
+            def tmp(_, inp, out):
+                gptq[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):  # calibration pass: the hooks feed every sub-layer's GPTQ instance
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(i, name)
+            print('Quantizing ...')
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)  # NOTE(review): the quantizer is never stored in `quantizers`, so the returned dict stays empty
+        for j in range(args.nsamples):  # recompute the outputs with the quantized weights
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq
+        torch.cuda.empty_cache()
+
+        inps, outs = outs, inps  # this block's outputs become the next block's inputs
+
+    model.config.use_cache = use_cache
+
+    return quantizers
+
+@torch.no_grad()
+def gptj_sequential_delta(model, delta_model, dataloader, dev):  # GPTQ-quantize delta_model block by block; inputs to each block come from `model`; returns {module name: Quantizer}; reads global `args`
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.transformer.h
+    delta_layers = delta_model.transformer.h
+
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):  # stands in for block 0 to record its inputs
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError  # abort the forward pass once the input is captured
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:  # raised on purpose by Catcher
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    original_outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(delta_layers)):
+        layer = delta_layers[i].to(dev)
+        original_layer = layers[i].to(dev)
+
+        subset = find_layers(layer)
+        gptq = {}
+        for name in subset:
+            gptq[name] = GPTQ(subset[name])
+            gptq[name].quantizer = Quantizer()
+            gptq[name].quantizer.configure(
+                args.wbits, perchannel=True, sym=args.sym, mse=False
+            )
+
+        def add_batch(name):
+            def tmp(_, inp, out):
+                gptq[name].add_batch(inp[0].data, out.data)
+            return tmp
+        handles = []
+        for name in subset:
+            handles.append(subset[name].register_forward_hook(add_batch(name)))
+        for j in range(args.nsamples):  # calibration pass: the hooks feed each GPTQ instance
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        for h in handles:
+            h.remove()
+
+        for name in subset:
+            print(i, name)
+            print('Quantizing ...')
+            gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+            quantizers['transformer.h.%d.%s' % (i, name)] = gptq[name].quantizer  # GPT-J module path, so keys match find_layers(model)
+            gptq[name].free()
+
+        for j in range(args.nsamples):  # second pass, now with the quantized weights
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+            original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq
+        torch.cuda.empty_cache()
+
+        inps, outs, original_outs = original_outs, inps, outs  # rotate all three buffers so the next inputs never alias original_outs
+
+    model.config.use_cache = use_cache
+
+    return quantizers
+
+@torch.no_grad()
+def gptj_eval(model, testenc, dev):  # Compute, print and return the perplexity of `model` on `testenc.input_ids`, running one block at a time on `dev`; reads global `args`
+    print('Evaluating ...')
+
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    # print(model.transformer.h)
+    layers = model.transformer.h
+    print(layers)
+
+    model.transformer.wte = model.transformer.wte.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):  # stands in for block 0 to record its inputs
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            raise ValueError  # abort the forward pass once the input is captured
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
+        try:
+            # print(batch.shape)
+            model(batch)
+        except ValueError:  # raised on purpose by Catcher
+            pass
+    layers[0] = layers[0].module
+
+    layers = model.transformer.h
+    layers[0] = layers[0].cpu()
+    model.transformer.wte = model.transformer.wte.cpu()
+    model.transformer.ln_f = model.transformer.ln_f.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+
+    for i in range(len(layers)):
+        print(i)
+        layer = layers[i].to(dev)
+
+        if args.nearest:  # round-to-nearest baseline: quantize the weights directly, no calibration
+            subset  = find_layers(layer)
+            for name in subset:
+                quantizer = Quantizer()
+                quantizer.configure(
+                    args.wbits, perchannel=True, sym=False, mse=False
+                )
+                W = subset[name].weight.data
+                quantizer.find_params(W, weight=True)
+                subset[name].weight.data = quantize(
+                    W, quantizer.scale, quantizer.zero, quantizer.maxq
+                ).to(next(iter(layer.parameters())).dtype)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps  # this block's outputs become the next block's inputs
+
+    model.transformer.ln_f = model.transformer.ln_f.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        hidden_states = model.transformer.ln_f(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()  # position t predicts token t+1
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        loss_fct = nn.CrossEntropyLoss()
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    print(ppl.item())
+
+    model.config.use_cache = use_cache
+    return ppl.item()  # main() stores this value; without it every result was None
+
+def gptj_pack(model, quantizers, wbits, groupsize):  # Swap the layers named in `quantizers` for QuantLinear modules and pack their weights; returns `model`
+    layers = find_layers(model)
+    layers = {n: layers[n] for n in quantizers}  # raises KeyError if a key is not a module name of `model`
+    make_quant(model, quantizers, wbits, groupsize)
+    qlayers = find_layers(model, [QuantLinear])
+    print('Packing ...')
+    for name in qlayers:
+        print(name)
+        quantizers[name],scale,zero = quantizers[name]  # expects a (quantizer, scale, zero) triple; NOTE(review): gptj_sequential_delta in this file stores a bare Quantizer - confirm
+        quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu()
+        qlayers[name].pack(layers[name], scale, zero)
+    print('Done!')
+    return model
+
+def load_quant(model, checkpoint, wbits, groupsize):  # Build a GPT-J from the config of `model`, swap its layers (except lm_head) for quantized ones and load `checkpoint` into it
+    from transformers import GPTJConfig, GPTJForCausalLM
+    config = GPTJConfig.from_pretrained(model)
+    def noop(*args, **kwargs):  # replaces torch's weight initializers below
+        pass
+    torch.nn.init.kaiming_uniform_ = noop  # NOTE: these three assignments patch torch.nn.init for the whole process
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)  # same call as two lines above
+    model = GPTJForCausalLM(config)
+    torch.set_default_dtype(torch.float)  # default dtype is set back to float after construction
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:  # lm_head is excluded from the layers handed to make_quant
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits, groupsize)
+
+    print('Loading model ...')
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint))
+    else:
+        model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048  # hard-coded here; get_gptj reads config.max_position_embeddings instead
+    print('Done!')
+
+    return model
+
+def gptj_multigpu(model, gpus):  # Spread the GPT-J blocks over `gpus` in place: embedding on the first device, final norm and lm_head on the last
+    model.transformer.wte = model.transformer.wte.to(gpus[0])  # GPT-J keeps its modules under `transformer`, as the rest of this file does
+    if hasattr(model.transformer, 'ln_f') and model.transformer.ln_f:
+        model.transformer.ln_f = model.transformer.ln_f.to(gpus[-1])
+    import copy
+    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
+
+    cache = {'mask': None}
+
+    class MoveModule(nn.Module):  # wraps a block and moves its input and attention mask to the block's device
+        def __init__(self, module):
+            super().__init__()
+            self.module = module  # was `self_module = module`, which left self.module unset
+            self.dev = next(iter(self.module.parameters())).device
+        def forward(self, *inp, **kwargs):
+            inp = list(inp)
+            if inp[0].device != self.dev:
+                inp[0] = inp[0].to(self.dev)
+            if cache['mask'] is None or cache['mask'].device != self.dev:
+                cache['mask'] = kwargs['attention_mask'].to(self.dev)  # shared cache: the mask is moved only when the device changes
+            kwargs['attention_mask'] = cache['mask']
+            tmp = self.module(*inp, **kwargs)
+            return tmp
+
+    layers = model.transformer.h
+    pergpu = math.ceil(len(layers) / len(gpus))
+    for i in range(len(layers)):
+        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
+
+    model.gpus = gpus  # benchmark() checks for this attribute
+
+def benchmark(model, input_ids, check=False):  # Time token-by-token generation over `input_ids`; prints per-token and median latency, plus PPL and peak memory when `check` is set
+    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
+    torch.cuda.synchronize()
+
+    cache = {'past': None}
+    def clear_past(i):
+        def tmp(layer, inp, out):
+            if cache['past']:
+                cache['past'][i] = None  # drop the cached entry for block i once that block has run
+        return tmp
+    for i, layer in enumerate(model.transformer.h):  # GPT-J blocks live under `transformer.h`
+        layer.register_forward_hook(clear_past(i))
+
+    print('Benchmarking ...')
+
+    if check:
+        loss = nn.CrossEntropyLoss()
+        tot = 0.
+
+    def sync():  # wait for every device in use so the wall-clock timings are meaningful
+        if hasattr(model, 'gpus'):
+            for gpu in model.gpus:
+                torch.cuda.synchronize(gpu)
+        else:
+            torch.cuda.synchronize()
+    max_memory = 0
+    with torch.no_grad():
+        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
+        times = []
+        for i in range(input_ids.numel()):
+            tick = time.time()
+
+            out = model(
+                input_ids[:, i:i+1],
+                past_key_values=cache['past'],
+                attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1))
+            )
+            sync()
+            times.append(time.time() - tick)
+            print(i, times[-1])
+            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)  # a stray `torch` argument here raised TypeError
+            if check and i != input_ids.numel() - 1:
+                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
+            cache['past'] = list(out.past_key_values)  # attribute name matches the keyword passed above
+            del out
+        sync()
+        import numpy as np
+        print('Median:', np.median(times))
+        if check:
+            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+            print('max memory(MiB):',max_memory)
+
+
+        
+def main(args):  # Run one compress-and-evaluate pass described by `args`; returns whatever gptj_eval returns for args.dataset
+    print(args)
+    num_params_saved_lr = 0
+    num_params = 0
+    if args.load:
+        model = load_quant(args.model, args.load, args.wbits, args.groupsize)  # the loader defined in this file; `load_quant3` does not exist here
+    else:
+        if args.delta and args.wbits<16:
+            model = get_gptj(args.model)
+            model.eval()
+            base_model = get_gptj(args.base_model)
+            base_model.eval()
+            dataloader, testloader = get_loaders(
+                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+            )
+            original_finetuned_model = copy.deepcopy(model)  # untouched copy, passed to gptj_sequential_delta below
+            for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+                finetuned_p.data = (finetuned_p.data-base_p.data).clone()  # `model` now holds the delta (finetuned - base)
+        else:
+            model = get_gptj(args.model)
+            model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    if not args.load and args.wbits < 16 and not args.nearest:  # a loaded checkpoint is already quantized
+        if args.delta:
+            tick = time.time()
+            quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV)
+
+            comp_time = time.time()-tick
+        else:
+            quantizers = gptj_sequential(model, dataloader, DEV)
+
+    if not args.load and args.delta and args.wbits<16:  # base_model only exists on the non-load delta path
+        for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+            if args.sparsify_hard_threshold:
+                print('Hard Thresholding...')
+                W = finetuned_p.data
+                finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
+            if args.rank>0 and len(finetuned_p.shape) == 2:
+                print('Finding Low Rank Approximation...')
+                A = finetuned_p.data.float()
+                U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5)  # NOTE(review): with center=True the factors describe the centered matrix - confirm the mean need not be added back
+                A  = U @ torch.diag_embed(S) @ Vh.T
+                finetuned_p.data =  A.half()
+                num_params_saved_lr += torch.numel(A) -  (torch.numel(U) + torch.numel(S) + torch.numel(Vh))
+            num_params += torch.numel(finetuned_p.data)
+            finetuned_p.data = (base_p.data + finetuned_p.data).clone()  # add the base weights back onto the compressed delta
+
+    if args.benchmark:
+        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+        if len(gpus) > 1:
+            gptj_multigpu(model, gpus)
+        else:
+            model = model.to(DEV)
+        if args.benchmark:
+            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
+            benchmark(model, input_ids, check=args.check)
+    if args.load:
+        exit()
+
+    dataset = args.dataset
+    dataloader, testloader = get_loaders(
+        dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    ppl = gptj_eval(model, testloader, DEV)
+    print(ppl)
+
+    if args.rank > 0:
+        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print("Number of params without low rank ", n_params)
+        print("Number of params with low rank", n_params - num_params_saved_lr)
+    if args.save:
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
+        torch.save(model.state_dict(), args.save)
+    return ppl
+
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--model', type=str, default='togethercomputer/GPT-JT-6B-v1',
+        help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.'
+    )
+    parser.add_argument(
+        '--base_model', type=str, default='EleutherAI/gpt-j-6b',
+        help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
+    )
+    parser.add_argument(
+        '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'],
+        help='Where to extract calibration data from.'
+    )
+    parser.add_argument(
+        '--seed',
+        type=int, default=0, help='Seed for sampling the calibration data.'
+    )
+    parser.add_argument(
+        '--nsamples', type=int, default=128,
+        help='Number of calibration data samples.'
+    )
+    parser.add_argument(
+        '--percdamp', type=float, default=.01,
+        help='Percent of the average Hessian diagonal to use for dampening.'
+    )
+    parser.add_argument(
+        '--nearest', action='store_true',
+        help='Whether to run the RTN baseline.'
+    )
+    parser.add_argument(
+        '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
+        help='#bits to use for quantization; use 16 for evaluating base model.'
+    )
+    parser.add_argument(
+        '--groupsize', type=int, default=-1,
+        help='Groupsize to use for quantization; default uses full row.'
+    )
+    parser.add_argument(
+        '--save', type=str, default='',
+        help='Save the quantized GPT-J model under this name.'
+    )
+    parser.add_argument(
+        '--save_safetensors', type=str, default='',
+        help='Save the quantized GPT-J model as a  `.safetensors` ckpt'
+    )
+    parser.add_argument(
+        '--load', type=str, default='',
+        help='Load the quantized GPT-J model'
+    )
+    parser.add_argument(
+        '--benchmark', type=int, default=0,
+        help='Number of tokens to use for benchmarking.'
+    )
+    parser.add_argument(
+        '--check', action='store_true',
+        help='Whether to compute perpexity during benchmarking for verification.'
+    )
+    parser.add_argument(
+        '--delta', action='store_true',
+        help='Whether to use delta compression'
+    )
+    parser.add_argument(
+        '--sparsify_hard_threshold', action='store_true',
+        help='Whether to add sparsity'
+    )
+    parser.add_argument(
+        '--fraction_of_zero', type=float, default=0.99,
+        help='Sparsity ratio'
+    )
+    parser.add_argument(
+        '--benchmark_results', type=str, default='',
+        help='store benchmark results'
+    )
+    parser.add_argument(
+        '--sym', action='store_true', default=True,
+        help='Whether to use symmetric quantization'
+    )
+    parser.add_argument(
+        '--trits', action='store_true', default=False,
+        help='Whether to use trits'
+    )
+    parser.add_argument('--act_order', type=lambda s: s.lower() in ('1', 'true', 'yes'), default=False)  # parse the text, so "False" is no longer truthy
+    parser.add_argument('--rank', type=int, default=0, help='Rank of the low-rank approximation applied to each 2-D delta in main(); 0 disables it.')
+    args = parser.parse_args()
+
+    results = PrettyTable()
+    results.field_names = ['Bits', 'wiki', 'ptb', 'c4']  # one perplexity column per dataset in the inner loop
+    for n_bits in [4, 3, 2]:
+        ppls = []
+        for dataset in ['wikitext2', 'ptb', 'c4']:
+            args.dataset = dataset
+            args.wbits = n_bits
+            args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits)
+            ppl = main(args)
+            ppls.append(ppl)
+        results.add_row([n_bits, ppls[0], ppls[1], ppls[2]])  # n_params/comp_time are locals of main() and not visible here
+        print(results)
+        if args.benchmark_results:  # the flag defaults to '', which would resolve to the working directory
+            with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
+                f.write(str(results))
+    print('finished.')
diff --git a/gptq.py b/gptq.py
index b4546cc..8f719e1 100644
--- a/gptq.py
+++ b/gptq.py
@@ -1,19 +1,16 @@
 import math
 import time
-
 import torch
-import torch.nn as nn
 import transformers
+import torch.nn as nn
 
-from quant import *
-
+from quant import quantize
 
 DEBUG = False 
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.allow_tf32 = False
 
-
 class GPTQ:
     def __init__(self, layer):
         self.layer = layer
@@ -57,7 +54,7 @@ def add_batch(self, inp, out):
         self.H += inp.matmul(inp.t())
 
     def fasterquant(
-        self, blocksize=128, percdamp=.01, groupsize=-1
+        self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False
     ):
         W = self.layer.weight.data.clone()
         if isinstance(self.layer, nn.Conv2d):
@@ -77,6 +74,11 @@ def fasterquant(
         H[dead, dead] = 1
         W[:, dead] = 0
 
+        if actorder:
+            perm = torch.argsort(torch.diag(H), descending=True)
+            W = W[:, perm]
+            H = H[perm][:, perm]
+
         Losses = torch.zeros_like(W)
         Q = torch.zeros_like(W)
 
@@ -87,10 +89,6 @@ def fasterquant(
         H = torch.cholesky_inverse(H)
         H = torch.linalg.cholesky(H, upper=True)
         Hinv = H
-        
-        scale = []
-        zero = []
-        now_idx = 1
 
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)
@@ -109,11 +107,6 @@ def fasterquant(
                 if groupsize != -1:
                     if (i1 + i) % groupsize == 0:
                         self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)
-                    
-                    if ((i1 + i) // groupsize) - now_idx == -1:
-                        scale.append(self.quantizer.scale)
-                        zero.append(self.quantizer.zero)
-                        now_idx += 1
 
                 q = quantize(
                     w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq
@@ -137,22 +130,21 @@ def fasterquant(
                 print(torch.sum(Losses))
 
         torch.cuda.synchronize()
-        print('time %.2f' % (time.time() - tick))
-        print('error', torch.sum(Losses).item())
+        total_time = time.time() - tick
+        # print('time %.2f' % total_time)
+        error = torch.sum(Losses).item()
+        # print('error', error)
+
+        if actorder:
+            invperm = torch.argsort(perm)
+            Q = Q[:, invperm]
 
         if isinstance(self.layer, transformers.Conv1D):
             Q = Q.t()
         self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
         if DEBUG:
             print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
-            
-        if scale == []:
-            scale.append(self.quantizer.scale)
-            zero.append(self.quantizer.zero)
-        scale = torch.cat(scale,dim=1)
-        zero = torch.cat(zero,dim=1)
-        return scale,zero
-            
+
     def free(self):
         if DEBUG:
             self.inp1 = None
diff --git a/quant.py b/quant.py
index fe58148..946da79 100644
--- a/quant.py
+++ b/quant.py
@@ -1,9 +1,11 @@
+import math
 import numpy as np
 import torch
 import torch.nn as nn
-import math
 
 def quantize(x, scale, zero, maxq):
+    if maxq < 0:
+        return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
     q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
     return scale * (q - zero)
 
@@ -16,10 +18,11 @@ def __init__(self, shape=1):
         self.register_buffer('zero', torch.zeros(shape))
 
     def configure(
-            self,
-            bits, perchannel=False, sym=True, 
-            mse=False, norm=2.4, grid=100, maxshrink=.8
-        ):
+        self,
+        bits, perchannel=False, sym=True, 
+        mse=False, norm=2.4, grid=100, maxshrink=.8,
+        trits=False
+    ):
         self.maxq = torch.tensor(2 ** bits - 1)
         self.perchannel = perchannel
         self.sym = sym
@@ -27,6 +30,8 @@ def configure(
         self.norm = norm
         self.grid = grid
         self.maxshrink = maxshrink 
+        if trits:
+            self.maxq = torch.tensor(-1) 
 
     def find_params(self, x, weight=False):
         dev = x.device
@@ -60,11 +65,15 @@ def find_params(self, x, weight=False):
         xmin[tmp] = -1
         xmax[tmp] = +1
 
-        self.scale = (xmax - xmin) / self.maxq
-        if self.sym:
-            self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
+        if self.maxq < 0:
+          self.scale = xmax
+          self.zero = xmin
         else:
-            self.zero = torch.round(-xmin / self.scale)
+          self.scale = (xmax - xmin) / self.maxq
+          if self.sym:
+              self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
+          else:
+              self.zero = torch.round(-xmin / self.scale)
 
         if self.mse:
             best = torch.full([x.shape[0]], float('inf'), device=dev)
@@ -124,6 +133,91 @@ def ready(self):
 except:
     print('CUDA extension not installed.')
 
+# Assumes layer is perfectly divisible into 1024 * 1024 blocks
+class Quant3Linear(nn.Module): 
+
+    def __init__(self, infeatures, outfeatures, faster=False):
+        super().__init__()
+        self.register_buffer('zeros', torch.zeros((outfeatures, 1)))
+        self.register_buffer('scales', torch.zeros((outfeatures, 1)))
+        self.register_buffer('bias', torch.zeros(outfeatures))
+        self.register_buffer(
+            'qweight', torch.zeros((infeatures // 32 * 3, outfeatures), dtype=torch.int)
+        )
+        self.faster = faster
+
+    def pack(self, linear, scales, zeros):
+        self.zeros = zeros * scales
+        self.scales = scales.clone()
+        self.bias = linear.bias.clone()
+
+        intweight = torch.round((linear.weight.data + self.zeros) / self.scales).to(torch.int)
+        intweight = intweight.t().contiguous()
+        intweight = intweight.numpy().astype(np.uint32)
+        qweight = np.zeros(
+            (intweight.shape[0] // 32 * 3, intweight.shape[1]), dtype=np.uint32
+        )
+        i = 0
+        row = 0
+        while row < qweight.shape[0]:
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i))
+            i += 10
+            qweight[row] |= intweight[i] << 30
+            row += 1
+            qweight[row] |= (intweight[i] >> 2) & 1
+            i += 1
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i) + 1)
+            i += 10
+            qweight[row] |= intweight[i] << 31
+            row += 1
+            qweight[row] |= (intweight[i] >> 1) & 0x3
+            i += 1
+            for j in range(i, i + 10):
+                qweight[row] |= intweight[j] << (3 * (j - i) + 2)
+            i += 10
+            row += 1
+
+        qweight = qweight.astype(np.int32)
+        self.qweight = torch.from_numpy(qweight) 
+
+    def forward(self, x):
+        if x.shape[-1] == x.numel():
+            outshape = list(x.shape)
+            y = self.bias.clone()
+            outshape[-1] = self.bias.numel()
+            dtype = x.dtype
+            if self.faster:
+                x = x.half()
+                quant_cuda.vecquant3matmul_faster(x, self.qweight, y, self.scales, self.zeros)
+            else:
+                x = x.float()
+                quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.zeros)
+            y = y.to(dtype)
+            return y.reshape(outshape)
+        raise ValueError('Only supports a single token currently.')
+
+def make_quant3(module, names, name='', faster=False):
+    if isinstance(module, Quant3Linear):
+        return
+    for attr in dir(module):
+        tmp = getattr(module, attr)
+        name1 = name + '.' + attr if name != '' else attr
+        if name1 in names:
+            setattr(
+                module, attr, Quant3Linear(tmp.in_features, tmp.out_features, faster=faster)
+            )
+    for name1, child in module.named_children():
+        make_quant3(child, names, name + '.' + name1 if name != '' else name1, faster=faster)
+
+def make_quant_lr(module, r_names, l_names, name='', faster=False):
+    if isinstance(module, Quant3Linear):
+        return
+    for attr in dir(module):
+        tmp = getattr(module, attr)
+        name1 = name + '.' + attr if name != '' else attr
+
 # Assumes layer is perfectly divisible into 256 * 256 blocks
 class QuantLinear(nn.Module): 
     def __init__(self, bits, groupsize, infeatures, outfeatures):

From 5025ab4d26b5b1565bf67bfcf155d78e2e0e9963 Mon Sep 17 00:00:00 2001
From: kumbong <kumbonghermann@gmail.com>
Date: Sat, 13 May 2023 23:32:17 +0000
Subject: [PATCH 07/12] update gptj

---
 .gitignore    |   1 +
 datautils.py  | 131 +++++-----------------
 delta.txt     |   5 +
 file_5.txt    |  12 +-
 gptj.py       |  35 ++++--
 gptj_delta.py |  37 +++++--
 gptq.py       |  12 +-
 llama.py      | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++
 quant.py      |   4 +-
 9 files changed, 402 insertions(+), 137 deletions(-)
 create mode 100644 delta.txt
 create mode 100644 llama.py

diff --git a/.gitignore b/.gitignore
index 761a9f6..83ed498 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.pyc
+*.pt
 build/
 dist/
 .idea
diff --git a/datautils.py b/datautils.py
index 08bfb66..a269a22 100644
--- a/datautils.py
+++ b/datautils.py
@@ -1,10 +1,12 @@
 import numpy as np
 import torch
 
+
 def set_seed(seed):
     np.random.seed(seed)
     torch.random.manual_seed(seed)
 
+
 def get_wikitext2(nsamples, seed, seqlen, model):
     from datasets import load_dataset
     traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
@@ -12,47 +14,44 @@ def get_wikitext2(nsamples, seed, seqlen, model):
 
     from transformers import AutoTokenizer 
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer("\n\n".join(traindata['text']), max_length = seqlen, truncation=True, return_tensors='pt')
-    testenc = tokenizer("\n\n".join(testdata['text']), max_length = seqlen, truncation=True, return_tensors='pt')
+    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
+    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
 
     import random
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        # j = i + seqlen
-        inp = trainenc.input_ids#[:, i:j]
+        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        j = i + seqlen
+        inp = trainenc.input_ids[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
     return trainloader, testenc
 
 def get_ptb(nsamples, seed, seqlen, model):
-    seqlen = seqlen
     from datasets import load_dataset
     traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
     valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
 
     from transformers import AutoTokenizer 
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer("\n\n".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
-    testenc = tokenizer("\n\n".join(valdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
+    trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt')
+    testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')
 
     import random
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        # j = i + seqlen
-        inp = trainenc.input_ids#[:, i:j]
+        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        j = i + seqlen
+        inp = trainenc.input_ids[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
     return trainloader, testenc
 
 def get_c4(nsamples, seed, seqlen, model):
-    print("loading the c4 dataset")
-    seqlen = 2048
     from datasets import load_dataset
     traindata = load_dataset(
         'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
@@ -68,18 +67,14 @@ def get_c4(nsamples, seed, seqlen, model):
     random.seed(seed)
     trainloader = []
     for _ in range(nsamples):
-        i = random.randint(0, len(traindata) - 1)
-        trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
-        # while True:
-        #     i = random.randint(0, len(traindata) - 1)
-        #     trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
-        #     print(trainenc.input_ids.shape)
-        #     if trainenc.input_ids.shape[1] > seqlen - 1:
-        #         break
-        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        # j = i + seqlen
-        inp = trainenc.input_ids#[:, i:j]
-        inp = trainenc.input_ids
+        while True:
+            i = random.randint(0, len(traindata) - 1)
+            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
+            if trainenc.input_ids.shape[1] >= seqlen:
+                break
+        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
+        j = i + seqlen
+        inp = trainenc.input_ids[:, i:j]
         tar = inp.clone()
         tar[:, :-1] = -100
         trainloader.append((inp, tar))
@@ -87,15 +82,15 @@ def get_c4(nsamples, seed, seqlen, model):
     import random
     random.seed(0)
     valenc = []
-    # for _ in range(256):
-    #     while True:
-    i = random.randint(0, len(valdata) - 1)
-    tmp = tokenizer(valdata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
-    #         if tmp.input_ids.shape[1] >= seqlen:
-    #             break
-    #     i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
-    #     j = i + seqlen
-    valenc.append(tmp.input_ids)
+    for _ in range(256):
+        while True:
+            i = random.randint(0, len(valdata) - 1)
+            tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
+            if tmp.input_ids.shape[1] >= seqlen:
+                break
+        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
+        j = i + seqlen
+        valenc.append(tmp.input_ids[:, i:j])
     valenc = torch.hstack(valenc)
     class TokenizerWrapper:
         def __init__(self, input_ids):
@@ -104,79 +99,13 @@ def __init__(self, input_ids):
 
     return trainloader, valenc 
 
-def get_ptb_new(nsamples, seed, seqlen, model):
-    from datasets import load_dataset
-    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
-    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')
-
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer(" ".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
-    testenc = tokenizer(" ".join(testdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt')
-
-    import random
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        # j = i + seqlen
-        inp = trainenc.input_ids#[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-    return trainloader, testenc
-
-def get_c4_new(nsamples, seed, seqlen, model):
-    print("loading the c4 new dataset")
-    from datasets import load_dataset
-    traindata = load_dataset(
-        'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
-    )
-    valdata = load_dataset(
-        'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation'
-    )
-
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    seqlen = 2048
-    import random
-    random.seed(seed)
-    trainloader = []
-    for _ in range(nsamples):
-        # while True:
-        #     i = random.randint(0, len(traindata) - 1)
-        trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt')
-            # if trainenc.input_ids.shape[1] >= seqlen:
-            #     break
-        # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
-        # j = i + seqlen
-        inp = trainenc.input_ids#[:, i:j]
-        tar = inp.clone()
-        tar[:, :-1] = -100
-        trainloader.append((inp, tar))
-
-    valenc = tokenizer(' '.join(valdata[:1100]['text']), max_length = 256, truncation=True, return_tensors='pt')
-    valenc = valenc.input_ids[:, :(256 * seqlen)]
-
-    class TokenizerWrapper:
-        def __init__(self, input_ids):
-            self.input_ids = input_ids
-    valenc = TokenizerWrapper(valenc)
-
-    return trainloader, valenc
-
 
 def get_loaders(
     name, nsamples=128, seed=0, seqlen=2048, model=''
 ):
-    print("loading from dataset ", name)
     if 'wikitext2' in name:
         return get_wikitext2(nsamples, seed, seqlen, model)
     if 'ptb' in name:
-        if 'new' in name:
-            return get_ptb_new(nsamples, seed, seqlen, model)
         return get_ptb(nsamples, seed, seqlen, model)
     if 'c4' in name:
-        if 'new' in name:
-            return get_c4_new(nsamples, seed, seqlen, model)
-        return get_c4(nsamples, seed, seqlen, model)
+        return get_c4(nsamples, seed, seqlen, model)
\ No newline at end of file
diff --git a/delta.txt b/delta.txt
new file mode 100644
index 0000000..a1c817b
--- /dev/null
+++ b/delta.txt
@@ -0,0 +1,5 @@
++------+-----------+------+
+| Bits | wikitext2 | ptb  |
++------+-----------+------+
+|  4   |    None   | None |
++------+-----------+------+
\ No newline at end of file
diff --git a/file_5.txt b/file_5.txt
index 2655a74..256a39d 100644
--- a/file_5.txt
+++ b/file_5.txt
@@ -1,7 +1,5 @@
-+------+-----------+--------------------+--------------------+--------------------+--------------------+
-| Bits |  n_params |        Time        |        wiki        |        ptb         |         c4         |
-+------+-----------+--------------------+--------------------+--------------------+--------------------+
-|  2   | 107356160 | 262.34580183029175 | 12.373908996582031 | 18.664175033569336 | 15.718718528747559 |
-|  3   | 107356160 | 271.34020018577576 | 12.558426856994629 | 19.13666343688965  | 16.14783477783203  |
-|  4   | 107356160 | 255.5096390247345  | 12.59843921661377  | 19.159931182861328 | 16.166603088378906 |
-+------+-----------+--------------------+--------------------+--------------------+--------------------+
\ No newline at end of file
++------+-----------+--------------------+--------------------+--------------------+
+| Bits |  n_params |        wiki        |        ptb         |         c4         |
++------+-----------+--------------------+--------------------+--------------------+
+|  4   | 107356160 | 2.9947431087493896 | 1.011309266090393  | 1.0010896921157837 |
++------+-----------+--------------------+--------------------+--------------------+
\ No newline at end of file
diff --git a/gptj.py b/gptj.py
index 4517509..a98cdd0 100644
--- a/gptj.py
+++ b/gptj.py
@@ -98,8 +98,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            print(i, name)
-            print('Quantizing ...')
+            # print(i, name)
+            # print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
@@ -188,8 +188,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            print(i, name)
-            print('Quantizing ...')
+            # print(i, name)
+            # print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
             quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
             gptq[name].free()
@@ -259,7 +259,7 @@ def forward(self, inp, **kwargs):
     attention_mask = cache['attention_mask']
 
     for i in range(len(layers)):
-        print(i)
+        #print(i)
         layer = layers[i].to(dev)
 
         if args.nearest:
@@ -310,13 +310,13 @@ def gptj_pack(model, quantizers, wbits, groupsize):
     layers = {n: layers[n] for n in quantizers}
     make_quant(model, quantizers, wbits, groupsize)
     qlayers = find_layers(model, [QuantLinear])
-    print('Packing ...')
+    #print('Packing ...')
     for name in qlayers:
         print(name)
         quantizers[name],scale,zero = quantizers[name]
         quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu()
         qlayers[name].pack(layers[name], scale, zero)
-    print('Done!')
+    #print('Done!')
     return model
 
 def load_quant(model, checkpoint, wbits, groupsize):
@@ -442,6 +442,12 @@ def main(args):
     else:
         model = get_gptj(args.model)
         model.eval()
+        dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+        )
+        ppl = gptj_eval(model, testloader, DEV)
+        print(ppl)
+        exit()
 
     dataloader, testloader = get_loaders(
         args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
@@ -484,7 +490,7 @@ def main(args):
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        '--model', type=str, default='EleutherAI/gpt-j-6b',
+        '--model', type=str, default='togethercomputer/GPT-JT-6B-v1',
         help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
     )
     parser.add_argument(
@@ -539,20 +545,27 @@ def main(args):
         '--benchmark_results', type=str, default='',
         help='store benchmark results'
     )
+    parser.add_argument(
+        '--rank', type=int, default=0,
+        help='The rank to use for decomposing each matrix'
+    )
     
     args = parser.parse_args()
         
     results = PrettyTable()
-    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
+    results.field_names = ['Bits', 'wikitext2', 'ptb']
     for n_bits in [4, 3, 2]:
         ppls = []
-        for dataset in ['wikitext2', 'ptb', 'c4']:
+        for dataset in ['wikitext2', 'ptb']:
+            print(n_bits)
+            print(dataset)
             args.dataset = dataset
             args.wbits = n_bits
             args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits)
             ppl = main(args)
             ppls.append(ppl)
-        results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
+            print(ppl)
+        results.add_row([n_bits] + ppls)
         print(results)
         with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
             f.write(str(results))
diff --git a/gptj_delta.py b/gptj_delta.py
index 2ea5dfc..d2302e6 100644
--- a/gptj_delta.py
+++ b/gptj_delta.py
@@ -12,6 +12,15 @@
 import os
 import copy
 
+
+def hard_threshold(x, fraction_of_zero=0.1):
+    y, _ = torch.sort(x.view(-1).abs().clone())
+    num_params = torch.numel(x)
+    thresh_index = int(num_params * fraction_of_zero)
+    threshold = y[thresh_index]
+    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
+    return mask * x
+
 def get_gptj(model):
     import torch
     def skip(*args, **kwargs):
@@ -98,8 +107,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            print(i, name)
-            print('Quantizing ...')
+            # print(i, name)
+            # print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
@@ -188,8 +197,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            print(i, name)
-            print('Quantizing ...')
+            # print(i, name)
+            # print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
             quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
             gptq[name].free()
@@ -506,15 +515,16 @@ def main(args):
     )
     
     ppl = gptj_eval(model, testloader, DEV)
-    print(ppl)
+    print('perplexity for model is', ppl)
 
     if args.rank > 0:
         n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
         print("Number of params without low rank ", n_params)
         print("Number of params with low rank", n_params - num_params_saved_lr)
     if args.save:
-        gptj_pack(model, quantizers, args.wbits, args.groupsize)
-        torch.save(model.state_dict(), args.save) 
+        # gptj_pack(model, quantizers, args.wbits, args.groupsize)
+        # torch.save(model.state_dict(), args.save) 
+        pass
     return ppl
 
 if __name__ == '__main__':
@@ -603,21 +613,28 @@ def main(args):
         '--trits', action='store_true', default=False, 
         help='Whether to use trits'
     )
+    parser.add_argument(
+        '--rank', type=int, default=0,
+        help='The rank to use for decomposing each matrix'
+    )
     parser.add_argument('--act_order', type=str, default=False)
     
     args = parser.parse_args()
         
     results = PrettyTable()
-    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
+    results.field_names = ['Bits', 'wikitext2', 'ptb']
     for n_bits in [4, 3, 2]:
         ppls = []
-        for dataset in ['wikitext2', 'ptb', 'c4']:
+        for dataset in ['wikitext2', 'ptb']:
+            print(n_bits)
+            print(dataset)
             args.dataset = dataset
             args.wbits = n_bits
             args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits)
             ppl = main(args)
             ppls.append(ppl)
-        results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
+            print(ppl)
+        results.add_row([n_bits] + ppls)
         print(results)
         with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
             f.write(str(results))
diff --git a/gptq.py b/gptq.py
index 8f719e1..e60f1fc 100644
--- a/gptq.py
+++ b/gptq.py
@@ -126,14 +126,14 @@ def fasterquant(
             if DEBUG:
                 self.layer.weight.data[:, :i2] = Q[:, :i2]
                 self.layer.weight.data[:, i2:] = W[:, i2:]
-                print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
-                print(torch.sum(Losses))
+                #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+                #print(torch.sum(Losses))
 
         torch.cuda.synchronize()
         total_time = time.time() - tick
-        # print('time %.2f' % total_time)
+        # #print('time %.2f' % total_time)
         error = torch.sum(Losses).item()
-        # print('error', error)
+        # #print('error', error)
 
         if actorder:
             invperm = torch.argsort(perm)
@@ -143,8 +143,8 @@ def fasterquant(
             Q = Q.t()
         self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)
         if DEBUG:
-            print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
-
+            #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
+            pass
     def free(self):
         if DEBUG:
             self.inp1 = None
diff --git a/llama.py b/llama.py
new file mode 100644
index 0000000..f1591d9
--- /dev/null
+++ b/llama.py
@@ -0,0 +1,302 @@
+import time
+
+import torch
+import torch.nn as nn
+
+from gptq import *
+from modelutils import *
+from quant import *
+
+
+def get_llama(model):
+    import torch
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    from transformers import LlamaForCausalLM
+    model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto')
+    model.seqlen = 2048
+    return model
+
+@torch.no_grad()
+def llama_sequential(model, dataloader, dev):
+    print('Starting ...')
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.layers
+
+    model.model.embed_tokens = model.model.embed_tokens.to(dev)
+    model.model.norm = model.model.norm.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            cache['position_ids'] = kwargs['position_ids']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.embed_tokens = model.model.embed_tokens.cpu()
+    model.model.norm = model.model.norm.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+    position_ids = cache['position_ids']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(layers)):
+        layer = layers[i].to(dev)
+        full = find_layers(layer)
+
+        if args.true_sequential:
+            sequential = [
+                ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
+                ['self_attn.o_proj'],
+                ['mlp.up_proj', 'mlp.gate_proj'],
+                ['mlp.down_proj']
+            ]
+        else:
+            sequential = [list(full.keys())]
+       
+        for names in sequential:
+            subset = {n: full[n] for n in names}
+
+            gptq = {}
+            for name in subset:
+                gptq[name] = GPTQ(subset[name])
+                gptq[name].quantizer = Quantizer()
+                gptq[name].quantizer.configure(
+                    args.wbits, perchannel=True, sym=args.sym, mse=False
+                )
+
+            def add_batch(name):
+                def tmp(_, inp, out):
+                    gptq[name].add_batch(inp[0].data, out.data)
+                return tmp
+            handles = []
+            for name in subset:
+                handles.append(subset[name].register_forward_hook(add_batch(name)))
+            for j in range(args.nsamples):
+                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+            for h in handles:
+                h.remove()
+
+            for name in subset:
+                print(i, name)
+                print('Quantizing ...')
+                gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+                quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer
+                gptq[name].free()
+
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq 
+        torch.cuda.empty_cache()
+
+        inps, outs = outs, inps
+
+    model.config.use_cache = use_cache
+    
+    return quantizers
+
+@torch.no_grad()
+def llama_eval(model, testenc, dev):
+    """Print and return the perplexity of `model` on `testenc`, running one decoder layer at a time on `dev`."""
+    print('Evaluating ...')
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+    # The KV cache is switched off for the evaluation; the saved setting is restored before returning.
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.layers
+    # Only the embedding and the first layer go to dev for capturing the first layer's inputs.
+    model.model.embed_tokens = model.model.embed_tokens.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+    # Catcher replaces layer 0: it records that layer's inputs and kwargs, then aborts the forward pass.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            cache['position_ids'] = kwargs['position_ids']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for i in range(nsamples):
+        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+        try:
+            model(batch)
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.embed_tokens = model.model.embed_tokens.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+    position_ids = cache['position_ids']
+    # Push every sample through one layer at a time so only a single layer sits on dev at once.
+    for i in range(len(layers)):
+        print(i)
+        layer = layers[i].to(dev)
+        # RTN baseline (--nearest): overwrite the weight of each module returned by find_layers with its quantized value.
+        if args.nearest:
+            subset = find_layers(layer)
+            for name in subset:
+                quantizer = Quantizer()
+                quantizer.configure(
+                    args.wbits, perchannel=True, sym=False, mse=False
+                )
+                W = subset[name].weight.data
+                quantizer.find_params(W, weight=True)
+                subset[name].weight.data = quantize(
+                    W, quantizer.scale, quantizer.zero, quantizer.maxq
+                ).to(next(iter(layer.parameters())).dtype)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        inps, outs = outs, inps
+
+    if model.model.norm is not None:
+        model.model.norm = model.model.norm.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+    loss_fct = nn.CrossEntropyLoss()  # built once; the loop below reuses it for every window
+    testenc = testenc.to(dev)
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.norm is not None:
+            hidden_states = model.model.norm(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        # Position t's logits are scored against token t + 1; the mean loss is then scaled by seqlen.
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)).item()
+    print(ppl)
+
+    model.config.use_cache = use_cache
+    return ppl
+
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+    # Positional arguments: model checkpoint and calibration dataset; the rest are quantization options.
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        'model', type=str,
+        help='LLaMA model to load; pass location of Hugging Face converted checkpoint.'
+    )
+    parser.add_argument(
+        'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
+        help='Where to extract calibration data from.'
+    )
+    parser.add_argument(
+        '--seed',
+        type=int, default=0, help='Seed for sampling the calibration data.'
+    )
+    parser.add_argument(
+        '--nsamples', type=int, default=128,
+        help='Number of calibration data samples.'
+    )
+    parser.add_argument(
+        '--percdamp', type=float, default=.01,
+        help='Percent of the average Hessian diagonal to use for dampening.'
+    )
+    parser.add_argument(
+        '--nearest', action='store_true',
+        help='Whether to run the RTN baseline.'
+    )
+    parser.add_argument(
+        '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16],
+        help='#bits to use for quantization; use 16 for evaluating base model.'
+    )
+    parser.add_argument(
+        '--groupsize', type=int, default=-1,
+        help='Groupsize to use for quantization; default uses full row.'
+    )
+    parser.add_argument(
+        '--sym', action='store_true',
+        help='Whether to perform symmetric quantization.'
+    )
+    parser.add_argument(
+        '--new-eval', action='store_true',
+        help='Whether to use the new PTB and C4 eval.'
+    )
+    parser.add_argument(
+        '--act-order', action='store_true',
+        help='Whether to apply the activation order GPTQ heuristic'
+    )
+    parser.add_argument(
+        '--true-sequential', action='store_true',
+        help='Whether to run in true sequential mode.'
+    )
+    # args must stay a module-level name: llama_sequential and llama_eval read it as a global.
+    args = parser.parse_args()
+
+    model = get_llama(args.model)
+    model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+    # GPTQ runs only below 16 bits and without --nearest (the RTN path is applied inside llama_eval).
+    if args.wbits < 16 and not args.nearest:
+        tick = time.time()
+        quantizers = llama_sequential(model, dataloader, DEV)
+        print(time.time() - tick)
+    # Perplexity is reported on all three evaluation sets, whichever dataset was used for calibration.
+    datasets = ['wikitext2', 'ptb', 'c4']
+    if args.new_eval:
+        datasets = ['wikitext2', 'ptb-new', 'c4-new']
+    for dataset in datasets:
+        dataloader, testloader = get_loaders(
+            dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+        )
+        print(dataset)
+        llama_eval(model, testloader, DEV)
\ No newline at end of file
diff --git a/quant.py b/quant.py
index 946da79..f57d6d7 100644
--- a/quant.py
+++ b/quant.py
@@ -131,8 +131,8 @@ def ready(self):
 try:
     import quant_cuda
 except:
-    print('CUDA extension not installed.')
-
+    #print('CUDA extension not installed.')
+    pass
 # Assumes layer is perfectly divisible into 1024 * 1024 blocks
 class Quant3Linear(nn.Module): 
 

From 4cae72910120b22043e203f316134042e54a6c9d Mon Sep 17 00:00:00 2001
From: kumbong <kumbonghermann@gmail.com>
Date: Sun, 14 May 2023 06:26:46 +0000
Subject: [PATCH 08/12] WIP

---
 delta_2_bits.txt           |  32 +++
 delta_2bits_sparse_099.txt |   8 +
 delta_4bits.txt            |  30 +++
 delta_4bits_sparse_09.txt  |   8 +
 delta_sparse_09.txt        |   5 +
 delta_sparse_099.txt       |   5 +
 evaluation.sh              |  48 ++--
 file_5.txt                 |   7 +-
 gptj.py                    |  67 ++----
 gptj_delta.py              |  80 ++-----
 llama_delta.py             | 441 +++++++++++++++++++++++++++++++++++++
 11 files changed, 604 insertions(+), 127 deletions(-)
 create mode 100644 delta_2_bits.txt
 create mode 100644 delta_2bits_sparse_099.txt
 create mode 100644 delta_4bits.txt
 create mode 100644 delta_4bits_sparse_09.txt
 create mode 100644 delta_sparse_09.txt
 create mode 100644 delta_sparse_099.txt
 create mode 100644 llama_delta.py

diff --git a/delta_2_bits.txt b/delta_2_bits.txt
new file mode 100644
index 0000000..41b76c4
--- /dev/null
+++ b/delta_2_bits.txt
@@ -0,0 +1,32 @@
+Starting ...
+Ready.
+0 self_attn.k_proj
+Quantizing ...
+0 self_attn.v_proj
+Quantizing ...
+0 self_attn.q_proj
+Quantizing ...
+0 self_attn.o_proj
+Quantizing ...
+0 mlp.up_proj
+Quantizing ...
+0 mlp.gate_proj
+Quantizing ...
+0 mlp.down_proj
+Quantizing ...
+1 self_attn.k_proj
+Quantizing ...
+1 self_attn.v_proj
+Quantizing ...
+1 self_attn.q_proj
+Quantizing ...
+1 self_attn.o_proj
+Quantizing ...
+1 mlp.up_proj
+Quantizing ...
+1 mlp.gate_proj
+Quantizing ...
+1 mlp.down_proj
+Quantizing ...
+2 self_attn.k_proj
+Quantizing ...
diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt
new file mode 100644
index 0000000..cfb6519
--- /dev/null
+++ b/delta_2bits_sparse_099.txt
@@ -0,0 +1,8 @@
+Starting ...
+Ready.
+0 self_attn.k_proj
+Quantizing ...
+0 self_attn.v_proj
+Quantizing ...
+0 self_attn.q_proj
+Quantizing ...
diff --git a/delta_4bits.txt b/delta_4bits.txt
new file mode 100644
index 0000000..39698f4
--- /dev/null
+++ b/delta_4bits.txt
@@ -0,0 +1,30 @@
+Starting ...
+Ready.
+0 self_attn.k_proj
+Quantizing ...
+0 self_attn.v_proj
+Quantizing ...
+0 self_attn.q_proj
+Quantizing ...
+0 self_attn.o_proj
+Quantizing ...
+0 mlp.up_proj
+Quantizing ...
+0 mlp.gate_proj
+Quantizing ...
+0 mlp.down_proj
+Quantizing ...
+1 self_attn.k_proj
+Quantizing ...
+1 self_attn.v_proj
+Quantizing ...
+1 self_attn.q_proj
+Quantizing ...
+1 self_attn.o_proj
+Quantizing ...
+1 mlp.up_proj
+Quantizing ...
+1 mlp.gate_proj
+Quantizing ...
+1 mlp.down_proj
+Quantizing ...
diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt
new file mode 100644
index 0000000..cfb6519
--- /dev/null
+++ b/delta_4bits_sparse_09.txt
@@ -0,0 +1,8 @@
+Starting ...
+Ready.
+0 self_attn.k_proj
+Quantizing ...
+0 self_attn.v_proj
+Quantizing ...
+0 self_attn.q_proj
+Quantizing ...
diff --git a/delta_sparse_09.txt b/delta_sparse_09.txt
new file mode 100644
index 0000000..a1c817b
--- /dev/null
+++ b/delta_sparse_09.txt
@@ -0,0 +1,5 @@
++------+-----------+------+
+| Bits | wikitext2 | ptb  |
++------+-----------+------+
+|  4   |    None   | None |
++------+-----------+------+
\ No newline at end of file
diff --git a/delta_sparse_099.txt b/delta_sparse_099.txt
new file mode 100644
index 0000000..a1c817b
--- /dev/null
+++ b/delta_sparse_099.txt
@@ -0,0 +1,5 @@
++------+-----------+------+
+| Bits | wikitext2 | ptb  |
++------+-----------+------+
+|  4   |    None   | None |
++------+-----------+------+
\ No newline at end of file
diff --git a/evaluation.sh b/evaluation.sh
index 2714383..92d2232 100755
--- a/evaluation.sh
+++ b/evaluation.sh
@@ -1,25 +1,41 @@
-CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \
+CUDA_VISIBLE_DEVICES=7 python3 -u llama_delta.py \
+    --wbits 4 \
+    --true-sequential --act-order --new-eval\
+    --groupsize 1024  > delta_4bits.txt & 
+
+CUDA_VISIBLE_DEVICES=2 python3 -u llama_delta.py \
+    --wbits 2 \
+    --true-sequential --act-order --new-eval\
+    --groupsize 1024  > delta_2_bits.txt & 
+
+CUDA_VISIBLE_DEVICES=6 python3 -u llama_delta.py \
     --groupsize 1024 \
-    --delta \
-    --benchmark_results "delta.txt" \
-&
-CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \
+    --wbits 4 \
+    --true-sequential --act-order --new-eval\
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.9 > delta_4bits_sparse_09.txt &
+
+CUDA_VISIBLE_DEVICES=5 python3 -u llama_delta.py \
     --groupsize 1024 \
+    --wbits 2 \
+    --true-sequential --act-order --new-eval\
     --sparsify_hard_threshold \
-    --fraction_of_zero 0.9 \
-    --delta \
-    --benchmark_results "delta_sparse_0.9.txt" 
-&
-CUDA_VISIBLE_DEVICES=2 python3 gptj.py \
+    --fraction_of_zero 0.9 > delta_2bits_sparse_09.txt &  # own log for the 0.9 run; *_099.txt is written by the 0.99 run
+
+CUDA_VISIBLE_DEVICES=4 python3 -u llama_delta.py \
     --groupsize 1024 \
+    --wbits 4 \
+    --true-sequential --act-order --new-eval\
     --sparsify_hard_threshold \
-    --fraction_of_zero 0.99 \
-    --delta \
-    --benchmark_results "delta_sparse_0.99.txt" \
-&
-CUDA_VISIBLE_DEVICES=3 python3 gptj.py \
+    --fraction_of_zero 0.99 > delta_4bits_sparse_099.txt &  # own log for the 0.99 run; *_09.txt is written by the 0.9 run
+
+CUDA_VISIBLE_DEVICES=3 python3 -u llama_delta.py \
     --groupsize 1024 \
-    --benchmark_results "base.txt" 
+    --wbits 2 \
+    --true-sequential --act-order --new-eval\
+    --sparsify_hard_threshold \
+    --fraction_of_zero 0.99 > delta_2bits_sparse_099.txt &
+
 # &
 # CUDA_VISIBLE_DEVICES=4 python3 gptj_delta.py \
 #     --groupsize 1024 \
diff --git a/file_5.txt b/file_5.txt
index 256a39d..5ef55d7 100644
--- a/file_5.txt
+++ b/file_5.txt
@@ -1,5 +1,10 @@
+LLAMA - Experiment results
+
 +------+-----------+--------------------+--------------------+--------------------+
 | Bits |  n_params |        wiki        |        ptb         |         c4         |
 +------+-----------+--------------------+--------------------+--------------------+
 |  4   | 107356160 | 2.9947431087493896 | 1.011309266090393  | 1.0010896921157837 |
-+------+-----------+--------------------+--------------------+--------------------+
\ No newline at end of file
++------+-----------+--------------------+--------------------+--------------------+
+|  4   | 107356160 | 2.9947431087493896 | 1.011309266090393  | 1.0010896921157837 |
++------+-----------+--------------------+--------------------+--------------------+
+
diff --git a/gptj.py b/gptj.py
index a98cdd0..0ae4900 100644
--- a/gptj.py
+++ b/gptj.py
@@ -98,8 +98,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            # print(i, name)
-            # print('Quantizing ...')
+            print(i, name)
+            print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
@@ -142,6 +142,7 @@ def forward(self, inp, **kwargs):
             cache['i'] += 1
             cache['attention_mask'] = kwargs['attention_mask']
             raise ValueError
+    
     layers[0] = Catcher(layers[0])
     for batch in dataloader:
         try:
@@ -188,8 +189,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            # print(i, name)
-            # print('Quantizing ...')
+            print(i, name)
+            print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
             quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
             gptq[name].free()
@@ -259,7 +260,7 @@ def forward(self, inp, **kwargs):
     attention_mask = cache['attention_mask']
 
     for i in range(len(layers)):
-        #print(i)
+        print(i)
         layer = layers[i].to(dev)
 
         if args.nearest:
@@ -310,13 +311,13 @@ def gptj_pack(model, quantizers, wbits, groupsize):
     layers = {n: layers[n] for n in quantizers}
     make_quant(model, quantizers, wbits, groupsize)
     qlayers = find_layers(model, [QuantLinear])
-    #print('Packing ...')
+    print('Packing ...')
     for name in qlayers:
         print(name)
         quantizers[name],scale,zero = quantizers[name]
         quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu()
         qlayers[name].pack(layers[name], scale, zero)
-    #print('Done!')
+    print('Done!')
     return model
 
 def load_quant(model, checkpoint, wbits, groupsize):
@@ -383,6 +384,7 @@ def forward(self, *inp, **kwargs):
     model.gpus = gpus
 
 def benchmark(model, input_ids, check=False):
+    print(model)
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
     torch.cuda.synchronize()
 
@@ -392,7 +394,7 @@ def tmp(layer, inp, out):
             if cache['past']:
                 cache['past'][i] = None
         return tmp
-    for i, layer in enumerate(model.model.layers):
+    for i, layer in enumerate(model.transformer.h):
         layer.register_forward_hook(clear_past(i))
 
     print('Benchmarking ...')
@@ -421,55 +423,40 @@ def sync():
             )
             sync()
             times.append(time.time() - tick)
-            print(i, times[-1])
-            max_memory = max(max_memory, torch, torch.cuda.memory_allocated() / 1024 /1024)
-            if check and i != input_ids.numel() - 1:
+            if i != input_ids.numel() - 1:
                 tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
-            cache['past'] = list(out.past_keys_values)
+            cache['past'] = list(out.past_key_values)
             del out
         sync()
         import numpy as np
         print('Median:', np.median(times))
-        if check:
-            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
-            print('max memory(MiB):',max_memory)
-
-
+        print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
+        print('max memory(MiB):',max_memory)
         
 def main(args):
+    print(args)
     if args.load:
         model = load_quant3(args.model, args.load)
     else:
         model = get_gptj(args.model)
         model.eval()
-        dataloader, testloader = get_loaders(
-        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
-        )
-        ppl = gptj_eval(model, testloader, DEV)
-        print(ppl)
-        exit()
 
     dataloader, testloader = get_loaders(
         args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
     )
 
     if args.wbits < 16 and not args.nearest:
-        tick = time.time()
+        print("Quantizing ...")
         quantizers = gptj_sequential(model, dataloader, DEV)
         print(time.time() - tick)
 
     if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            gptj_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
+        model = model.to(DEV)
         if args.benchmark:
             input_ids = next(iter(dataloader))[0][:, :args.benchmark]
             benchmark(model, input_ids, check=args.check)
     if args.load:
         exit()
-    
 
     dataloader, testloader = get_loaders(
         args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
@@ -490,7 +477,7 @@ def main(args):
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        '--model', type=str, default='togethercomputer/GPT-JT-6B-v1',
+        '--model', type=str, default='EleutherAI/gpt-j-6b',
         help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.'
     )
     parser.add_argument(
@@ -545,28 +532,20 @@ def main(args):
         '--benchmark_results', type=str, default='',
         help='store benchmark results'
     )
-    parser.add_argument(
-        '--rank', type=int, default=0,
-        help='The rank to use for decomposing each matrices'
-    )
     
     args = parser.parse_args()
-        
     results = PrettyTable()
-    results.field_names = ['Bits', 'wikitext2', 'ptb']
-    for n_bits in [4, 3, 2]:
+    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
+    for n_bits in [16]:
         ppls = []
-        for dataset in ['wikitext2', 'ptb']:
-            print(n_bits)
-            print(dataset)
+        for dataset in ['wikitext2']:
             args.dataset = dataset
             args.wbits = n_bits
             args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits)
             ppl = main(args)
             ppls.append(ppl)
-            print(ppl)
-        results.add_row([n_bits] + ppls)
+        results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
         print(results)
         with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
             f.write(str(results))
-    print('finished.')
+    print('finished.')
\ No newline at end of file
diff --git a/gptj_delta.py b/gptj_delta.py
index d2302e6..490e3e5 100644
--- a/gptj_delta.py
+++ b/gptj_delta.py
@@ -12,15 +12,6 @@
 import os
 import copy
 
-
-def hard_threshold(x, fraction_of_zero=0.1):
-    y, _ = torch.sort(x.view(-1).abs().clone())
-    num_params = torch.numel(x)
-    thresh_index = int(num_params * fraction_of_zero)
-    threshold = y[thresh_index]
-    mask = x.abs().clone().gt(threshold).type(torch.FloatTensor)
-    return mask * x
-
 def get_gptj(model):
     import torch
     def skip(*args, **kwargs):
@@ -107,8 +98,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            # print(i, name)
-            # print('Quantizing ...')
+            print(i, name)
+            print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize)
         for j in range(args.nsamples):
             outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
@@ -197,8 +188,8 @@ def tmp(_, inp, out):
             h.remove()
 
         for name in subset:
-            # print(i, name)
-            # print('Quantizing ...')
+            print(i, name)
+            print('Quantizing ...')
             gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
             quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer
             gptq[name].free()
@@ -253,7 +244,7 @@ def forward(self, inp, **kwargs):
     for i in range(nsamples):
         batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev)
         try:
-            # print(batch.shape)
+            print(batch.shape)
             model(batch)
         except ValueError:
             pass
@@ -361,37 +352,6 @@ def noop(*args, **kwargs):
 
     return model
 
-def gptj_multigpu(model, gpus):
-    model.model.embed_tokens = model.model.embed_tokens.to(gpus[0])
-    if hasattr(model.model, 'norm') and model.model.norm:
-        model.model.norm = model.model.norm.to(gpus[-1])
-    import copy
-    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
-
-    cache = {'mask': None}
-
-    class MoveModule(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self_module = module
-            self.dev = next(iter(self.module.parameters())).device
-        def forward(self, *inp, **kwargs):
-            inp = list(inp)
-            if inp[0].device != self.dev:
-                inp[0] = inp[0].to(self.dev)
-            if cache['mask'] is None or cache ['mask'].device != self.dev:
-                cache['mask'] = kwargs['attention_mask'].to(self.dev)
-            kwargs['attention_mask'] = cache['mask']
-            tmp = self.module(*inp, **kwargs)
-            return tmp
-
-    layers = model.model.layers
-    pergpu = math.ceil(len(layers) / len(gpus))
-    for i in range(len(layers)):
-        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
-
-    model.gpus = gpus
-
 def benchmark(model, input_ids, check=False):
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
     torch.cuda.synchronize()
@@ -402,7 +362,7 @@ def tmp(layer, inp, out):
             if cache['past']:
                 cache['past'][i] = None
         return tmp
-    for i, layer in enumerate(model.model.layers):
+    for i, layer in enumerate(model.transformer.h):
         layer.register_forward_hook(clear_past(i))
 
     print('Benchmarking ...')
@@ -498,11 +458,7 @@ def main(args):
             finetuned_p.data = (base_p.data + finetuned_p.data).clone()
 
     if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            gptj_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
+        model = model.to(DEV)
         if args.benchmark:
             input_ids = next(iter(dataloader))[0][:, :args.benchmark]
             benchmark(model, input_ids, check=args.check)
@@ -515,16 +471,15 @@ def main(args):
     )
     
     ppl = gptj_eval(model, testloader, DEV)
-    print('perpexity for model is', ppl)
+    print(ppl)
 
     if args.rank > 0:
         n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
         print("Number of params without low rank ", n_params)
         print("Number of params with low rank", n_params - num_params_saved_lr)
     if args.save:
-        # gptj_pack(model, quantizers, args.wbits, args.groupsize)
-        # torch.save(model.state_dict(), args.save) 
-        pass
+        gptj_pack(model, quantizers, args.wbits, args.groupsize)
+        torch.save(model.state_dict(), args.save) 
     return ppl
 
 if __name__ == '__main__':
@@ -613,29 +568,22 @@ def main(args):
         '--trits', action='store_true', default=False, 
         help='Whether to use trits'
     )
-    parser.add_argument(
-        '--rank', type=int, default=0,
-        help='The rank to use for decomposing each matrices'
-    )
     parser.add_argument('--act_order', type=str, default=False)
     
     args = parser.parse_args()
         
     results = PrettyTable()
-    results.field_names = ['Bits', 'wikitext2', 'ptb']
+    results.field_names = ['Bits', 'n_params', 'Time', 'wiki']
     for n_bits in [4, 3, 2]:
         ppls = []
-        for dataset in ['wikitext2', 'ptb']:
-            print(n_bits)
-            print(dataset)
+        for dataset in ['wikitext2', 'ptb', 'c4']:
             args.dataset = dataset
             args.wbits = n_bits
             args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits)
             ppl = main(args)
             ppls.append(ppl)
-            print(ppl)
-        results.add_row([n_bits] + ppls)
+        results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]])
         print(results)
         with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f:
             f.write(str(results))
-    print('finished.')
+    print('finished.')
\ No newline at end of file
diff --git a/llama_delta.py b/llama_delta.py
new file mode 100644
index 0000000..d8fa99a
--- /dev/null
+++ b/llama_delta.py
@@ -0,0 +1,441 @@
+import time
+
+import torch
+import torch.nn as nn
+
+from gptq import *
+from modelutils import *
+from quant import *
+import copy
+import os
+
+def hard_threshold(x, fraction_of_zero=0.1):
+    """Return x with every entry whose magnitude is <= the one at sorted rank int(numel * fraction_of_zero) set to zero."""
+    y, _ = torch.sort(x.reshape(-1).abs())  # reshape also accepts non-contiguous x, unlike view
+    # Clamp the rank so fraction_of_zero == 1.0 zeroes everything instead of indexing past the end.
+    threshold = y[min(int(y.numel() * fraction_of_zero), y.numel() - 1)]
+    mask = x.abs().gt(threshold).float()  # float32 like the old FloatTensor cast, but left on x's device
+    return mask * x
+
+def get_llama(model):
+    """Load a LLaMA checkpoint through transformers with three torch.nn.init functions stubbed out; sets seqlen to 2048."""
+    import torch
+    def skip(*args, **kwargs):
+        pass
+    for init_name in ('kaiming_uniform_', 'uniform_', 'normal_'):
+        setattr(torch.nn.init, init_name, skip)
+    from transformers import LlamaForCausalLM
+    llama = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto')
+    llama.seqlen = 2048
+    return llama
+
+@torch.no_grad()
+def llama_sequential_delta(model, delta_model, dataloader, dev):
+    """Quantize `model.model.layers` layer by layer with GPTQ on `dev`; return quantizers keyed 'model.layers.<i>.<name>'."""
+    print('Starting ...')
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+
+    layers = model.model.layers
+    delta_layers = delta_model.model.layers
+    # NOTE(review): delta_model only supplies the layer count below; the layers quantized are model's - confirm intent.
+    model.model.embed_tokens = model.model.embed_tokens.to(dev)
+    model.model.norm = model.model.norm.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+    # Catcher replaces layer 0 so that running the model records that layer's inputs and kwargs.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            cache['position_ids'] = kwargs['position_ids']
+            # Everything needed is recorded; abort the rest of the forward pass.
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.embed_tokens = model.model.embed_tokens.cpu()
+    model.model.norm = model.model.norm.cpu()
+    torch.cuda.empty_cache()
+
+    # outs receives each layer's outputs; inps and outs are swapped after every layer.
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+    position_ids = cache['position_ids']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(delta_layers)):
+        layer = layers[i].to(dev)
+        full = find_layers(layer)
+        # With --true-sequential the sub-layers are handled in the four groups below, in order; otherwise all at once.
+        if args.true_sequential:
+            sequential = [
+                ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
+                ['self_attn.o_proj'],
+                ['mlp.up_proj', 'mlp.gate_proj'],
+                ['mlp.down_proj']
+            ]
+        else:
+            sequential = [list(full.keys())]
+
+        for names in sequential:
+            subset = {n: full[n] for n in names}
+
+            gptq = {}
+            for name in subset:
+                gptq[name] = GPTQ(subset[name])
+                gptq[name].quantizer = Quantizer()
+                gptq[name].quantizer.configure(
+                    args.wbits, perchannel=True, sym=args.sym, mse=False
+                )
+            # Forward hooks hand each sub-layer's input and output to its GPTQ object while the samples run.
+            def add_batch(name):
+                def tmp(_, inp, out):
+                    gptq[name].add_batch(inp[0].data, out.data)
+                return tmp
+            handles = []
+            for name in subset:
+                handles.append(subset[name].register_forward_hook(add_batch(name)))
+            for j in range(args.nsamples):
+                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+            for h in handles:
+                h.remove()
+
+            for name in subset:
+                print(i, name)
+                print('Quantizing ...')
+                gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+                quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer
+                gptq[name].free()
+        # Recompute the outputs once all groups are done; the swap below makes them the next layer's inputs.
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq
+        torch.cuda.empty_cache()
+
+        inps, outs = outs, inps
+
+    model.config.use_cache = use_cache
+
+    return quantizers
+
+@torch.no_grad()
+def llama_sequential(model, dataloader, dev):
+    """Quantize `model.model.layers` layer by layer with GPTQ on `dev`; return quantizers keyed 'model.layers.<i>.<name>'."""
+    print('Starting ...')
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.layers
+    # The embedding, the final norm and the first layer go to dev for capturing the first layer's inputs.
+    model.model.embed_tokens = model.model.embed_tokens.to(dev)
+    model.model.norm = model.model.norm.to(dev)
+    layers[0] = layers[0].to(dev)
+    # NOTE: args is read as a module-level global throughout this function; it is not a parameter.
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None}
+    # Catcher replaces layer 0: it records that layer's inputs and kwargs, then aborts the forward pass.
+    class Catcher(nn.Module):
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            cache['position_ids'] = kwargs['position_ids']
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    for batch in dataloader:
+        try:
+            model(batch[0].to(dev))
+        except ValueError:
+            pass
+    layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.embed_tokens = model.model.embed_tokens.cpu()
+    model.model.norm = model.model.norm.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+    position_ids = cache['position_ids']
+
+    print('Ready.')
+
+    quantizers = {}
+    for i in range(len(layers)):
+        layer = layers[i].to(dev)
+        full = find_layers(layer)
+        # With --true-sequential the sub-layers are handled in the four groups below, in order; otherwise all at once.
+        if args.true_sequential:
+            sequential = [
+                ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
+                ['self_attn.o_proj'],
+                ['mlp.up_proj', 'mlp.gate_proj'],
+                ['mlp.down_proj']
+            ]
+        else:
+            sequential = [list(full.keys())]
+
+        for names in sequential:
+            subset = {n: full[n] for n in names}
+
+            gptq = {}
+            for name in subset:
+                gptq[name] = GPTQ(subset[name])
+                gptq[name].quantizer = Quantizer()
+                gptq[name].quantizer.configure(
+                    args.wbits, perchannel=True, sym=args.sym, mse=False
+                )
+            # Forward hooks hand each sub-layer's input and output to its GPTQ object while the samples run.
+            def add_batch(name):
+                def tmp(_, inp, out):
+                    gptq[name].add_batch(inp[0].data, out.data)
+                return tmp
+            handles = []
+            for name in subset:
+                handles.append(subset[name].register_forward_hook(add_batch(name)))
+            for j in range(args.nsamples):
+                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+            for h in handles:
+                h.remove()
+
+            for name in subset:
+                print(i, name)
+                print('Quantizing ...')
+                gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
+                quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer
+                gptq[name].free()
+        # Recompute the outputs once all groups are done; the swap below makes them the next layer's inputs.
+        for j in range(args.nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+
+        layers[i] = layer.cpu()
+        del layer
+        del gptq 
+        torch.cuda.empty_cache()
+
+        inps, outs = outs, inps
+
+    model.config.use_cache = use_cache
+
+    return quantizers
+
+@torch.no_grad()
+def llama_eval(model, testenc, dev, nearest=None, wbits=None):
+    """Evaluate ``model`` on ``testenc`` one decoder layer at a time and report perplexity.
+
+    The embedding and the first decoder layer are moved to ``dev`` to capture
+    the hidden states entering the decoder stack. Each decoder layer is then
+    moved to ``dev``, run over every sample and moved back to the CPU, so only
+    one decoder layer is placed on ``dev`` at a time. Finally the norm and the
+    LM head are moved to ``dev`` (and left there) to compute the loss.
+
+    Args:
+        model: Model exposing ``seqlen``, ``config``, ``lm_head`` and
+            ``model.embed_tokens`` / ``model.layers`` / ``model.norm``.
+        testenc: Tokenized test set; its ``input_ids`` tensor is cut along
+            dim 1 into ``numel() // model.seqlen`` windows of ``model.seqlen``
+            tokens (a trailing remainder is ignored).
+        dev: Device on which the active modules and activations are placed.
+        nearest: When true, the weights of every sub-layer returned by
+            ``find_layers`` are replaced by the output of ``quantize`` before
+            the layer is run (the ``--nearest`` baseline). ``None`` (default)
+            falls back to the module-level ``args.nearest``.
+        wbits: Bit width given to ``Quantizer.configure`` for that baseline.
+            ``None`` (default) falls back to the module-level ``args.wbits``.
+
+    Returns:
+        float: The perplexity over all windows; the value is also printed.
+    """
+    print('Evaluating ...')
+
+    # Fall back to the script-level options so existing three-argument calls
+    # behave exactly as before, while importers can pass the values explicitly.
+    if nearest is None:
+        nearest = args.nearest
+    if nearest and wbits is None:
+        wbits = args.wbits
+
+    testenc = testenc.input_ids
+    nsamples = testenc.numel() // model.seqlen
+
+    use_cache = model.config.use_cache
+    model.config.use_cache = False
+    layers = model.model.layers
+
+    model.model.embed_tokens = model.model.embed_tokens.to(dev)
+    layers[0] = layers[0].to(dev)
+
+    dtype = next(iter(model.parameters())).dtype
+    inps = torch.zeros(
+        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
+    )
+    cache = {'i': 0, 'attention_mask': None, 'position_ids': None}
+
+    class Catcher(nn.Module):
+        """Stand-in for the first layer: records its inputs, then aborts the forward pass."""
+        def __init__(self, module):
+            super().__init__()
+            self.module = module
+        def forward(self, inp, **kwargs):
+            inps[cache['i']] = inp
+            cache['i'] += 1
+            cache['attention_mask'] = kwargs['attention_mask']
+            cache['position_ids'] = kwargs['position_ids']
+            # Deliberate early exit: nothing past the first layer's input is needed.
+            raise ValueError
+    layers[0] = Catcher(layers[0])
+    try:
+        for i in range(nsamples):
+            batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
+            try:
+                model(batch)
+            except ValueError:
+                pass
+    finally:
+        # Always unwrap, so an unexpected error cannot leave the Catcher installed.
+        layers[0] = layers[0].module
+
+    layers[0] = layers[0].cpu()
+    model.model.embed_tokens = model.model.embed_tokens.cpu()
+    torch.cuda.empty_cache()
+
+    outs = torch.zeros_like(inps)
+    attention_mask = cache['attention_mask']
+    position_ids = cache['position_ids']
+
+    for i in range(len(layers)):
+        print(i)
+        layer = layers[i].to(dev)
+
+        if nearest:
+            subset = find_layers(layer)
+            for name in subset:
+                quantizer = Quantizer()
+                quantizer.configure(
+                    wbits, perchannel=True, sym=False, mse=False
+                )
+                W = subset[name].weight.data
+                quantizer.find_params(W, weight=True)
+                subset[name].weight.data = quantize(
+                    W, quantizer.scale, quantizer.zero, quantizer.maxq
+                ).to(next(iter(layer.parameters())).dtype)
+
+        for j in range(nsamples):
+            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
+        layers[i] = layer.cpu()
+        del layer
+        torch.cuda.empty_cache()
+        # The outputs of this layer are the inputs of the next one.
+        inps, outs = outs, inps
+
+    if model.model.norm is not None:
+        model.model.norm = model.model.norm.to(dev)
+    model.lm_head = model.lm_head.to(dev)
+
+    testenc = testenc.to(dev)
+    loss_fct = nn.CrossEntropyLoss()
+    nlls = []
+    for i in range(nsamples):
+        hidden_states = inps[i].unsqueeze(0)
+        if model.model.norm is not None:
+            hidden_states = model.model.norm(hidden_states)
+        lm_logits = model.lm_head(hidden_states)
+        # Position t predicts token t + 1: drop the last logit and the first label.
+        shift_logits = lm_logits[:, :-1, :].contiguous()
+        shift_labels = testenc[
+            :, (i * model.seqlen):((i + 1) * model.seqlen)
+        ][:, 1:]
+        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        neg_log_likelihood = loss.float() * model.seqlen
+        nlls.append(neg_log_likelihood)
+    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
+    print(ppl.item())
+
+    model.config.use_cache = use_cache
+    return ppl.item()
+
+
+# Script entry point: parse the command-line options, load the base and the
+# fine-tuned checkpoints, run llama_sequential_delta, rebuild the fine-tuned
+# weights as base + (optionally hard-thresholded) delta, then evaluate.
+# NOTE(review): copy, get_llama, hard_threshold and DEV are not defined in this
+# block; they are assumed to come from earlier in the file or its star imports.
+if __name__ == '__main__':
+    import argparse
+    from datautils import *
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--model', type=str, default='ausboss/llama-13b-supercot',
+        help='LlaMa model to load; pass location of hugginface converted checkpoint.'
+    )
+    parser.add_argument(
+        '--base-model', type=str, default='yahma/llama-13b-hf',
+        help='base LLAMA model to load'
+    )
+    parser.add_argument(
+        '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'],
+        help='Where to extract calibration data from.'
+    )
+    parser.add_argument(
+        '--seed',
+        type=int, default=0, help='Seed for sampling the calibration data.'
+    )
+    parser.add_argument(
+        '--nsamples', type=int, default=128,
+        help='Number of calibration data samples.'
+    )
+    parser.add_argument(
+        '--percdamp', type=float, default=.01,
+        help='Percent of the average Hessian diagonal to use for dampening.'
+    )
+    parser.add_argument(
+        '--nearest', action='store_true',
+        help='Whether to run the RTN baseline.'
+    ) 
+    parser.add_argument(
+        '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16],
+        help='#bits to use for quantization; use 16 for evaluating base model.'
+    )
+    parser.add_argument(
+        '--groupsize', type=int, default=-1,
+        help='Groupsize to use for quantization; default uses full row.'
+    )
+    parser.add_argument(
+        '--sym', action='store_true',
+        help='Whether to perform symmetric quantization.'
+    )
+    parser.add_argument(
+        '--new-eval', action='store_true',
+        help='Whether to use the new PTB and C4 eval.'
+    )
+    parser.add_argument(
+        '--act-order', action='store_true',
+        help='Whether to apply the activation order GPTQ heuristic'
+    )
+    parser.add_argument(
+        '--true-sequential', action='store_true',
+        help='Whether to run in true sequential model.'
+    )
+    # The two options below use underscores, unlike the dashed options above;
+    # evaluation.sh passes them with underscores.
+    parser.add_argument(
+        '--sparsify_hard_threshold', action='store_true',
+        help='Whether to add sparsity'
+    )
+    parser.add_argument(
+        '--fraction_of_zero', type=float, default=0.99,
+        help='Sparsity ratio'
+    )
+    # `args` stays a module-level global; llama_eval reads args.nearest and
+    # args.wbits from it.
+    args = parser.parse_args()
+
+    base_model = get_llama(args.base_model)
+    model = get_llama(args.model)
+    model.eval()
+
+    dataloader, testloader = get_loaders(
+        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
+    )
+
+    # Keep an independent copy of the fine-tuned model and pass it, together
+    # with `model`, to llama_sequential_delta; the returned quantizers are discarded.
+    # NOTE(review): this call is unconditional, so it also runs with the default
+    # --wbits 16 and with --nearest, whereas main() in opt.py guards quantization
+    # with `args.wbits < 16 and not args.nearest` - confirm this is intended.
+    original_finetuned_model = copy.deepcopy(model)
+    _ = llama_sequential_delta(original_finetuned_model, model, dataloader, DEV)
+    # Replace every parameter of `model` by its difference from the base model.
+    # zip() pairs parameters by position and stops at the shorter sequence, so
+    # both models are assumed to have identical parameter order and shapes.
+    for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+        finetuned_p.data = (finetuned_p.data-base_p.data).clone()
+                
+    
+    # Optionally threshold each delta tensor, then add the base weights back so
+    # `model` again holds full weights. The message is printed once per parameter.
+    # NOTE(review): without --sparsify_hard_threshold the two loops amount to
+    # (w - base) + base, which may not be bit-exact in low precision - confirm.
+    for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()):
+        if args.sparsify_hard_threshold:
+            print('Hard Thresholding...')
+            W = finetuned_p.data
+            finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero)
+        finetuned_p.data = (base_p.data + finetuned_p.data).clone()
+            
+    # Evaluate on all three datasets; --new-eval swaps PTB and C4 for their
+    # '-new' variants. Only the test split returned by get_loaders is used here.
+    datasets = ['wikitext2', 'ptb', 'c4'] 
+    if args.new_eval:
+      datasets = ['wikitext2', 'ptb-new', 'c4-new']
+    for dataset in datasets:
+        dataloader, testloader = get_loaders(
+            dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
+        )
+        print(dataset)
+        llama_eval(model, testloader, DEV)
\ No newline at end of file

From c64f2760507fc3583ac53c37c521fe4727bd84be Mon Sep 17 00:00:00 2001
From: kumbong <kumbonghermann@gmail.com>
Date: Sun, 14 May 2023 07:15:22 +0000
Subject: [PATCH 09/12] exec

---
 evaluation.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evaluation.sh b/evaluation.sh
index 92d2232..d995499 100755
--- a/evaluation.sh
+++ b/evaluation.sh
@@ -20,14 +20,14 @@ CUDA_VISIBLE_DEVICES=5 python3 -u llama_delta.py \
     --wbits 2 \
     --true-sequential --act-order --new-eval\
     --sparsify_hard_threshold \
-    --fraction_of_zero 0.9 > delta_2bits_sparse_099.txt &
+    --fraction_of_zero 0.9 > delta_2bits_sparse_09.txt &
 
 CUDA_VISIBLE_DEVICES=4 python3 -u llama_delta.py \
     --groupsize 1024 \
     --wbits 4 \
     --true-sequential --act-order --new-eval\
     --sparsify_hard_threshold \
-    --fraction_of_zero 0.99 > delta_4bits_sparse_09.txt &
+    --fraction_of_zero 0.99 > delta_4bits_sparse_099.txt &
 
 CUDA_VISIBLE_DEVICES=3 python3 -u llama_delta.py \
     --groupsize 1024 \

From e327308dc6b760a973536548d322301388833059 Mon Sep 17 00:00:00 2001
From: root <kumbonghermann@gmail.com>
Date: Sun, 14 May 2023 07:53:57 +0000
Subject: [PATCH 10/12] some results

---
 delta_2_bits.txt           | 665 ++++++++++++++++++++++++++++++
 delta_2bits_sparse_09.txt  | 800 ++++++++++++++++++++++++++++++++++++
 delta_2bits_sparse_099.txt | 803 ++++++++++++++++++++++++++++++++++++
 delta_4bits.txt            |  29 --
 delta_4bits_sparse_09.txt  | 804 ++++++++++++++++++++++++++++++++++++
 delta_4bits_sparse_099.txt | 808 +++++++++++++++++++++++++++++++++++++
 6 files changed, 3880 insertions(+), 29 deletions(-)
 create mode 100644 delta_2bits_sparse_09.txt
 create mode 100644 delta_4bits_sparse_099.txt

diff --git a/delta_2_bits.txt b/delta_2_bits.txt
index 41b76c4..f7647d2 100644
--- a/delta_2_bits.txt
+++ b/delta_2_bits.txt
@@ -30,3 +30,668 @@ Quantizing ...
 Quantizing ...
 2 self_attn.k_proj
 Quantizing ...
+2 self_attn.v_proj
+Quantizing ...
+2 self_attn.q_proj
+Quantizing ...
+2 self_attn.o_proj
+Quantizing ...
+2 mlp.up_proj
+Quantizing ...
+2 mlp.gate_proj
+Quantizing ...
+2 mlp.down_proj
+Quantizing ...
+3 self_attn.k_proj
+Quantizing ...
+3 self_attn.v_proj
+Quantizing ...
+3 self_attn.q_proj
+Quantizing ...
+3 self_attn.o_proj
+Quantizing ...
+3 mlp.up_proj
+Quantizing ...
+3 mlp.gate_proj
+Quantizing ...
+3 mlp.down_proj
+Quantizing ...
+4 self_attn.k_proj
+Quantizing ...
+4 self_attn.v_proj
+Quantizing ...
+4 self_attn.q_proj
+Quantizing ...
+4 self_attn.o_proj
+Quantizing ...
+4 mlp.up_proj
+Quantizing ...
+4 mlp.gate_proj
+Quantizing ...
+4 mlp.down_proj
+Quantizing ...
+5 self_attn.k_proj
+Quantizing ...
+5 self_attn.v_proj
+Quantizing ...
+5 self_attn.q_proj
+Quantizing ...
+5 self_attn.o_proj
+Quantizing ...
+5 mlp.up_proj
+Quantizing ...
+5 mlp.gate_proj
+Quantizing ...
+5 mlp.down_proj
+Quantizing ...
+6 self_attn.k_proj
+Quantizing ...
+6 self_attn.v_proj
+Quantizing ...
+6 self_attn.q_proj
+Quantizing ...
+6 self_attn.o_proj
+Quantizing ...
+6 mlp.up_proj
+Quantizing ...
+6 mlp.gate_proj
+Quantizing ...
+6 mlp.down_proj
+Quantizing ...
+7 self_attn.k_proj
+Quantizing ...
+7 self_attn.v_proj
+Quantizing ...
+7 self_attn.q_proj
+Quantizing ...
+7 self_attn.o_proj
+Quantizing ...
+7 mlp.up_proj
+Quantizing ...
+7 mlp.gate_proj
+Quantizing ...
+7 mlp.down_proj
+Quantizing ...
+8 self_attn.k_proj
+Quantizing ...
+8 self_attn.v_proj
+Quantizing ...
+8 self_attn.q_proj
+Quantizing ...
+8 self_attn.o_proj
+Quantizing ...
+8 mlp.up_proj
+Quantizing ...
+8 mlp.gate_proj
+Quantizing ...
+8 mlp.down_proj
+Quantizing ...
+9 self_attn.k_proj
+Quantizing ...
+9 self_attn.v_proj
+Quantizing ...
+9 self_attn.q_proj
+Quantizing ...
+9 self_attn.o_proj
+Quantizing ...
+9 mlp.up_proj
+Quantizing ...
+9 mlp.gate_proj
+Quantizing ...
+9 mlp.down_proj
+Quantizing ...
+10 self_attn.k_proj
+Quantizing ...
+10 self_attn.v_proj
+Quantizing ...
+10 self_attn.q_proj
+Quantizing ...
+10 self_attn.o_proj
+Quantizing ...
+10 mlp.up_proj
+Quantizing ...
+10 mlp.gate_proj
+Quantizing ...
+10 mlp.down_proj
+Quantizing ...
+11 self_attn.k_proj
+Quantizing ...
+11 self_attn.v_proj
+Quantizing ...
+11 self_attn.q_proj
+Quantizing ...
+11 self_attn.o_proj
+Quantizing ...
+11 mlp.up_proj
+Quantizing ...
+11 mlp.gate_proj
+Quantizing ...
+11 mlp.down_proj
+Quantizing ...
+12 self_attn.k_proj
+Quantizing ...
+12 self_attn.v_proj
+Quantizing ...
+12 self_attn.q_proj
+Quantizing ...
+12 self_attn.o_proj
+Quantizing ...
+12 mlp.up_proj
+Quantizing ...
+12 mlp.gate_proj
+Quantizing ...
+12 mlp.down_proj
+Quantizing ...
+13 self_attn.k_proj
+Quantizing ...
+13 self_attn.v_proj
+Quantizing ...
+13 self_attn.q_proj
+Quantizing ...
+13 self_attn.o_proj
+Quantizing ...
+13 mlp.up_proj
+Quantizing ...
+13 mlp.gate_proj
+Quantizing ...
+13 mlp.down_proj
+Quantizing ...
+14 self_attn.k_proj
+Quantizing ...
+14 self_attn.v_proj
+Quantizing ...
+14 self_attn.q_proj
+Quantizing ...
+14 self_attn.o_proj
+Quantizing ...
+14 mlp.up_proj
+Quantizing ...
+14 mlp.gate_proj
+Quantizing ...
+14 mlp.down_proj
+Quantizing ...
+15 self_attn.k_proj
+Quantizing ...
+15 self_attn.v_proj
+Quantizing ...
+15 self_attn.q_proj
+Quantizing ...
+15 self_attn.o_proj
+Quantizing ...
+15 mlp.up_proj
+Quantizing ...
+15 mlp.gate_proj
+Quantizing ...
+15 mlp.down_proj
+Quantizing ...
+16 self_attn.k_proj
+Quantizing ...
+16 self_attn.v_proj
+Quantizing ...
+16 self_attn.q_proj
+Quantizing ...
+16 self_attn.o_proj
+Quantizing ...
+16 mlp.up_proj
+Quantizing ...
+16 mlp.gate_proj
+Quantizing ...
+16 mlp.down_proj
+Quantizing ...
+17 self_attn.k_proj
+Quantizing ...
+17 self_attn.v_proj
+Quantizing ...
+17 self_attn.q_proj
+Quantizing ...
+17 self_attn.o_proj
+Quantizing ...
+17 mlp.up_proj
+Quantizing ...
+17 mlp.gate_proj
+Quantizing ...
+17 mlp.down_proj
+Quantizing ...
+18 self_attn.k_proj
+Quantizing ...
+18 self_attn.v_proj
+Quantizing ...
+18 self_attn.q_proj
+Quantizing ...
+18 self_attn.o_proj
+Quantizing ...
+18 mlp.up_proj
+Quantizing ...
+18 mlp.gate_proj
+Quantizing ...
+18 mlp.down_proj
+Quantizing ...
+19 self_attn.k_proj
+Quantizing ...
+19 self_attn.v_proj
+Quantizing ...
+19 self_attn.q_proj
+Quantizing ...
+19 self_attn.o_proj
+Quantizing ...
+19 mlp.up_proj
+Quantizing ...
+19 mlp.gate_proj
+Quantizing ...
+19 mlp.down_proj
+Quantizing ...
+20 self_attn.k_proj
+Quantizing ...
+20 self_attn.v_proj
+Quantizing ...
+20 self_attn.q_proj
+Quantizing ...
+20 self_attn.o_proj
+Quantizing ...
+20 mlp.up_proj
+Quantizing ...
+20 mlp.gate_proj
+Quantizing ...
+20 mlp.down_proj
+Quantizing ...
+21 self_attn.k_proj
+Quantizing ...
+21 self_attn.v_proj
+Quantizing ...
+21 self_attn.q_proj
+Quantizing ...
+21 self_attn.o_proj
+Quantizing ...
+21 mlp.up_proj
+Quantizing ...
+21 mlp.gate_proj
+Quantizing ...
+21 mlp.down_proj
+Quantizing ...
+22 self_attn.k_proj
+Quantizing ...
+22 self_attn.v_proj
+Quantizing ...
+22 self_attn.q_proj
+Quantizing ...
+22 self_attn.o_proj
+Quantizing ...
+22 mlp.up_proj
+Quantizing ...
+22 mlp.gate_proj
+Quantizing ...
+22 mlp.down_proj
+Quantizing ...
+23 self_attn.k_proj
+Quantizing ...
+23 self_attn.v_proj
+Quantizing ...
+23 self_attn.q_proj
+Quantizing ...
+23 self_attn.o_proj
+Quantizing ...
+23 mlp.up_proj
+Quantizing ...
+23 mlp.gate_proj
+Quantizing ...
+23 mlp.down_proj
+Quantizing ...
+24 self_attn.k_proj
+Quantizing ...
+24 self_attn.v_proj
+Quantizing ...
+24 self_attn.q_proj
+Quantizing ...
+24 self_attn.o_proj
+Quantizing ...
+24 mlp.up_proj
+Quantizing ...
+24 mlp.gate_proj
+Quantizing ...
+24 mlp.down_proj
+Quantizing ...
+25 self_attn.k_proj
+Quantizing ...
+25 self_attn.v_proj
+Quantizing ...
+25 self_attn.q_proj
+Quantizing ...
+25 self_attn.o_proj
+Quantizing ...
+25 mlp.up_proj
+Quantizing ...
+25 mlp.gate_proj
+Quantizing ...
+25 mlp.down_proj
+Quantizing ...
+26 self_attn.k_proj
+Quantizing ...
+26 self_attn.v_proj
+Quantizing ...
+26 self_attn.q_proj
+Quantizing ...
+26 self_attn.o_proj
+Quantizing ...
+26 mlp.up_proj
+Quantizing ...
+26 mlp.gate_proj
+Quantizing ...
+26 mlp.down_proj
+Quantizing ...
+27 self_attn.k_proj
+Quantizing ...
+27 self_attn.v_proj
+Quantizing ...
+27 self_attn.q_proj
+Quantizing ...
+27 self_attn.o_proj
+Quantizing ...
+27 mlp.up_proj
+Quantizing ...
+27 mlp.gate_proj
+Quantizing ...
+27 mlp.down_proj
+Quantizing ...
+28 self_attn.k_proj
+Quantizing ...
+28 self_attn.v_proj
+Quantizing ...
+28 self_attn.q_proj
+Quantizing ...
+28 self_attn.o_proj
+Quantizing ...
+28 mlp.up_proj
+Quantizing ...
+28 mlp.gate_proj
+Quantizing ...
+28 mlp.down_proj
+Quantizing ...
+29 self_attn.k_proj
+Quantizing ...
+29 self_attn.v_proj
+Quantizing ...
+29 self_attn.q_proj
+Quantizing ...
+29 self_attn.o_proj
+Quantizing ...
+29 mlp.up_proj
+Quantizing ...
+29 mlp.gate_proj
+Quantizing ...
+29 mlp.down_proj
+Quantizing ...
+30 self_attn.k_proj
+Quantizing ...
+30 self_attn.v_proj
+Quantizing ...
+30 self_attn.q_proj
+Quantizing ...
+30 self_attn.o_proj
+Quantizing ...
+30 mlp.up_proj
+Quantizing ...
+30 mlp.gate_proj
+Quantizing ...
+30 mlp.down_proj
+Quantizing ...
+31 self_attn.k_proj
+Quantizing ...
+31 self_attn.v_proj
+Quantizing ...
+31 self_attn.q_proj
+Quantizing ...
+31 self_attn.o_proj
+Quantizing ...
+31 mlp.up_proj
+Quantizing ...
+31 mlp.gate_proj
+Quantizing ...
+31 mlp.down_proj
+Quantizing ...
+32 self_attn.k_proj
+Quantizing ...
+32 self_attn.v_proj
+Quantizing ...
+32 self_attn.q_proj
+Quantizing ...
+32 self_attn.o_proj
+Quantizing ...
+32 mlp.up_proj
+Quantizing ...
+32 mlp.gate_proj
+Quantizing ...
+32 mlp.down_proj
+Quantizing ...
+33 self_attn.k_proj
+Quantizing ...
+33 self_attn.v_proj
+Quantizing ...
+33 self_attn.q_proj
+Quantizing ...
+33 self_attn.o_proj
+Quantizing ...
+33 mlp.up_proj
+Quantizing ...
+33 mlp.gate_proj
+Quantizing ...
+33 mlp.down_proj
+Quantizing ...
+34 self_attn.k_proj
+Quantizing ...
+34 self_attn.v_proj
+Quantizing ...
+34 self_attn.q_proj
+Quantizing ...
+34 self_attn.o_proj
+Quantizing ...
+34 mlp.up_proj
+Quantizing ...
+34 mlp.gate_proj
+Quantizing ...
+34 mlp.down_proj
+Quantizing ...
+35 self_attn.k_proj
+Quantizing ...
+35 self_attn.v_proj
+Quantizing ...
+35 self_attn.q_proj
+Quantizing ...
+35 self_attn.o_proj
+Quantizing ...
+35 mlp.up_proj
+Quantizing ...
+35 mlp.gate_proj
+Quantizing ...
+35 mlp.down_proj
+Quantizing ...
+36 self_attn.k_proj
+Quantizing ...
+36 self_attn.v_proj
+Quantizing ...
+36 self_attn.q_proj
+Quantizing ...
+36 self_attn.o_proj
+Quantizing ...
+36 mlp.up_proj
+Quantizing ...
+36 mlp.gate_proj
+Quantizing ...
+36 mlp.down_proj
+Quantizing ...
+37 self_attn.k_proj
+Quantizing ...
+37 self_attn.v_proj
+Quantizing ...
+37 self_attn.q_proj
+Quantizing ...
+37 self_attn.o_proj
+Quantizing ...
+37 mlp.up_proj
+Quantizing ...
+37 mlp.gate_proj
+Quantizing ...
+37 mlp.down_proj
+Quantizing ...
+38 self_attn.k_proj
+Quantizing ...
+38 self_attn.v_proj
+Quantizing ...
+38 self_attn.q_proj
+Quantizing ...
+38 self_attn.o_proj
+Quantizing ...
+38 mlp.up_proj
+Quantizing ...
+38 mlp.gate_proj
+Quantizing ...
+38 mlp.down_proj
+Quantizing ...
+39 self_attn.k_proj
+Quantizing ...
+39 self_attn.v_proj
+Quantizing ...
+39 self_attn.q_proj
+Quantizing ...
+39 self_attn.o_proj
+Quantizing ...
+39 mlp.up_proj
+Quantizing ...
+39 mlp.gate_proj
+Quantizing ...
+39 mlp.down_proj
+Quantizing ...
+wikitext2
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+5.183259963989258
+Downloading and preparing dataset ptb_text_only/penn_treebank (download: 5.68 MiB, generated: 5.72 MiB, post-processed: Unknown size, total: 11.40 MiB) to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f...
+Dataset ptb_text_only downloaded and prepared to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f. Subsequent calls will reuse this data.
+ptb-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+19.479209899902344
+Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...
+Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.
+Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...
+Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.
+c4-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+6.7892680168151855
diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt
new file mode 100644
index 0000000..ae0aeb4
--- /dev/null
+++ b/delta_2bits_sparse_09.txt
@@ -0,0 +1,800 @@
+Starting ...
+Ready.
+0 self_attn.k_proj
+Quantizing ...
+0 self_attn.v_proj
+Quantizing ...
+0 self_attn.q_proj
+Quantizing ...
+0 self_attn.o_proj
+Quantizing ...
+0 mlp.up_proj
+Quantizing ...
+0 mlp.gate_proj
+Quantizing ...
+0 mlp.down_proj
+Quantizing ...
+1 self_attn.k_proj
+Quantizing ...
+1 self_attn.v_proj
+Quantizing ...
+1 self_attn.q_proj
+Quantizing ...
+1 self_attn.o_proj
+Quantizing ...
+1 mlp.up_proj
+Quantizing ...
+1 mlp.gate_proj
+Quantizing ...
+1 mlp.down_proj
+Quantizing ...
+2 self_attn.k_proj
+Quantizing ...
+2 self_attn.v_proj
+Quantizing ...
+2 self_attn.q_proj
+Quantizing ...
+2 self_attn.o_proj
+Quantizing ...
+2 mlp.up_proj
+Quantizing ...
+2 mlp.gate_proj
+Quantizing ...
+2 mlp.down_proj
+Quantizing ...
+3 self_attn.k_proj
+Quantizing ...
+3 self_attn.v_proj
+Quantizing ...
+3 self_attn.q_proj
+Quantizing ...
+3 self_attn.o_proj
+Quantizing ...
+3 mlp.up_proj
+Quantizing ...
+3 mlp.gate_proj
+Quantizing ...
+3 mlp.down_proj
+Quantizing ...
+4 self_attn.k_proj
+Quantizing ...
+4 self_attn.v_proj
+Quantizing ...
+4 self_attn.q_proj
+Quantizing ...
+4 self_attn.o_proj
+Quantizing ...
+4 mlp.up_proj
+Quantizing ...
+4 mlp.gate_proj
+Quantizing ...
+4 mlp.down_proj
+Quantizing ...
+5 self_attn.k_proj
+Quantizing ...
+5 self_attn.v_proj
+Quantizing ...
+5 self_attn.q_proj
+Quantizing ...
+5 self_attn.o_proj
+Quantizing ...
+5 mlp.up_proj
+Quantizing ...
+5 mlp.gate_proj
+Quantizing ...
+5 mlp.down_proj
+Quantizing ...
+6 self_attn.k_proj
+Quantizing ...
+6 self_attn.v_proj
+Quantizing ...
+6 self_attn.q_proj
+Quantizing ...
+6 self_attn.o_proj
+Quantizing ...
+6 mlp.up_proj
+Quantizing ...
+6 mlp.gate_proj
+Quantizing ...
+6 mlp.down_proj
+Quantizing ...
+7 self_attn.k_proj
+Quantizing ...
+7 self_attn.v_proj
+Quantizing ...
+7 self_attn.q_proj
+Quantizing ...
+7 self_attn.o_proj
+Quantizing ...
+7 mlp.up_proj
+Quantizing ...
+7 mlp.gate_proj
+Quantizing ...
+7 mlp.down_proj
+Quantizing ...
+8 self_attn.k_proj
+Quantizing ...
+8 self_attn.v_proj
+Quantizing ...
+8 self_attn.q_proj
+Quantizing ...
+8 self_attn.o_proj
+Quantizing ...
+8 mlp.up_proj
+Quantizing ...
+8 mlp.gate_proj
+Quantizing ...
+8 mlp.down_proj
+Quantizing ...
+9 self_attn.k_proj
+Quantizing ...
+9 self_attn.v_proj
+Quantizing ...
+9 self_attn.q_proj
+Quantizing ...
+9 self_attn.o_proj
+Quantizing ...
+9 mlp.up_proj
+Quantizing ...
+9 mlp.gate_proj
+Quantizing ...
+9 mlp.down_proj
+Quantizing ...
+10 self_attn.k_proj
+Quantizing ...
+10 self_attn.v_proj
+Quantizing ...
+10 self_attn.q_proj
+Quantizing ...
+10 self_attn.o_proj
+Quantizing ...
+10 mlp.up_proj
+Quantizing ...
+10 mlp.gate_proj
+Quantizing ...
+10 mlp.down_proj
+Quantizing ...
+11 self_attn.k_proj
+Quantizing ...
+11 self_attn.v_proj
+Quantizing ...
+11 self_attn.q_proj
+Quantizing ...
+11 self_attn.o_proj
+Quantizing ...
+11 mlp.up_proj
+Quantizing ...
+11 mlp.gate_proj
+Quantizing ...
+11 mlp.down_proj
+Quantizing ...
+12 self_attn.k_proj
+Quantizing ...
+12 self_attn.v_proj
+Quantizing ...
+12 self_attn.q_proj
+Quantizing ...
+12 self_attn.o_proj
+Quantizing ...
+12 mlp.up_proj
+Quantizing ...
+12 mlp.gate_proj
+Quantizing ...
+12 mlp.down_proj
+Quantizing ...
+13 self_attn.k_proj
+Quantizing ...
+13 self_attn.v_proj
+Quantizing ...
+13 self_attn.q_proj
+Quantizing ...
+13 self_attn.o_proj
+Quantizing ...
+13 mlp.up_proj
+Quantizing ...
+13 mlp.gate_proj
+Quantizing ...
+13 mlp.down_proj
+Quantizing ...
+14 self_attn.k_proj
+Quantizing ...
+14 self_attn.v_proj
+Quantizing ...
+14 self_attn.q_proj
+Quantizing ...
+14 self_attn.o_proj
+Quantizing ...
+14 mlp.up_proj
+Quantizing ...
+14 mlp.gate_proj
+Quantizing ...
+14 mlp.down_proj
+Quantizing ...
+15 self_attn.k_proj
+Quantizing ...
+15 self_attn.v_proj
+Quantizing ...
+15 self_attn.q_proj
+Quantizing ...
+15 self_attn.o_proj
+Quantizing ...
+15 mlp.up_proj
+Quantizing ...
+15 mlp.gate_proj
+Quantizing ...
+15 mlp.down_proj
+Quantizing ...
+16 self_attn.k_proj
+Quantizing ...
+16 self_attn.v_proj
+Quantizing ...
+16 self_attn.q_proj
+Quantizing ...
+16 self_attn.o_proj
+Quantizing ...
+16 mlp.up_proj
+Quantizing ...
+16 mlp.gate_proj
+Quantizing ...
+16 mlp.down_proj
+Quantizing ...
+17 self_attn.k_proj
+Quantizing ...
+17 self_attn.v_proj
+Quantizing ...
+17 self_attn.q_proj
+Quantizing ...
+17 self_attn.o_proj
+Quantizing ...
+17 mlp.up_proj
+Quantizing ...
+17 mlp.gate_proj
+Quantizing ...
+17 mlp.down_proj
+Quantizing ...
+18 self_attn.k_proj
+Quantizing ...
+18 self_attn.v_proj
+Quantizing ...
+18 self_attn.q_proj
+Quantizing ...
+18 self_attn.o_proj
+Quantizing ...
+18 mlp.up_proj
+Quantizing ...
+18 mlp.gate_proj
+Quantizing ...
+18 mlp.down_proj
+Quantizing ...
+19 self_attn.k_proj
+Quantizing ...
+19 self_attn.v_proj
+Quantizing ...
+19 self_attn.q_proj
+Quantizing ...
+19 self_attn.o_proj
+Quantizing ...
+19 mlp.up_proj
+Quantizing ...
+19 mlp.gate_proj
+Quantizing ...
+19 mlp.down_proj
+Quantizing ...
+20 self_attn.k_proj
+Quantizing ...
+20 self_attn.v_proj
+Quantizing ...
+20 self_attn.q_proj
+Quantizing ...
+20 self_attn.o_proj
+Quantizing ...
+20 mlp.up_proj
+Quantizing ...
+20 mlp.gate_proj
+Quantizing ...
+20 mlp.down_proj
+Quantizing ...
+21 self_attn.k_proj
+Quantizing ...
+21 self_attn.v_proj
+Quantizing ...
+21 self_attn.q_proj
+Quantizing ...
+21 self_attn.o_proj
+Quantizing ...
+21 mlp.up_proj
+Quantizing ...
+21 mlp.gate_proj
+Quantizing ...
+21 mlp.down_proj
+Quantizing ...
+22 self_attn.k_proj
+Quantizing ...
+22 self_attn.v_proj
+Quantizing ...
+22 self_attn.q_proj
+Quantizing ...
+22 self_attn.o_proj
+Quantizing ...
+22 mlp.up_proj
+Quantizing ...
+22 mlp.gate_proj
+Quantizing ...
+22 mlp.down_proj
+Quantizing ...
+23 self_attn.k_proj
+Quantizing ...
+23 self_attn.v_proj
+Quantizing ...
+23 self_attn.q_proj
+Quantizing ...
+23 self_attn.o_proj
+Quantizing ...
+23 mlp.up_proj
+Quantizing ...
+23 mlp.gate_proj
+Quantizing ...
+23 mlp.down_proj
+Quantizing ...
+24 self_attn.k_proj
+Quantizing ...
+24 self_attn.v_proj
+Quantizing ...
+24 self_attn.q_proj
+Quantizing ...
+24 self_attn.o_proj
+Quantizing ...
+24 mlp.up_proj
+Quantizing ...
+24 mlp.gate_proj
+Quantizing ...
+24 mlp.down_proj
+Quantizing ...
+25 self_attn.k_proj
+Quantizing ...
+25 self_attn.v_proj
+Quantizing ...
+25 self_attn.q_proj
+Quantizing ...
+25 self_attn.o_proj
+Quantizing ...
+25 mlp.up_proj
+Quantizing ...
+25 mlp.gate_proj
+Quantizing ...
+25 mlp.down_proj
+Quantizing ...
+26 self_attn.k_proj
+Quantizing ...
+26 self_attn.v_proj
+Quantizing ...
+26 self_attn.q_proj
+Quantizing ...
+26 self_attn.o_proj
+Quantizing ...
+26 mlp.up_proj
+Quantizing ...
+26 mlp.gate_proj
+Quantizing ...
+26 mlp.down_proj
+Quantizing ...
+27 self_attn.k_proj
+Quantizing ...
+27 self_attn.v_proj
+Quantizing ...
+27 self_attn.q_proj
+Quantizing ...
+27 self_attn.o_proj
+Quantizing ...
+27 mlp.up_proj
+Quantizing ...
+27 mlp.gate_proj
+Quantizing ...
+27 mlp.down_proj
+Quantizing ...
+28 self_attn.k_proj
+Quantizing ...
+28 self_attn.v_proj
+Quantizing ...
+28 self_attn.q_proj
+Quantizing ...
+28 self_attn.o_proj
+Quantizing ...
+28 mlp.up_proj
+Quantizing ...
+28 mlp.gate_proj
+Quantizing ...
+28 mlp.down_proj
+Quantizing ...
+29 self_attn.k_proj
+Quantizing ...
+29 self_attn.v_proj
+Quantizing ...
+29 self_attn.q_proj
+Quantizing ...
+29 self_attn.o_proj
+Quantizing ...
+29 mlp.up_proj
+Quantizing ...
+29 mlp.gate_proj
+Quantizing ...
+29 mlp.down_proj
+Quantizing ...
+30 self_attn.k_proj
+Quantizing ...
+30 self_attn.v_proj
+Quantizing ...
+30 self_attn.q_proj
+Quantizing ...
+30 self_attn.o_proj
+Quantizing ...
+30 mlp.up_proj
+Quantizing ...
+30 mlp.gate_proj
+Quantizing ...
+30 mlp.down_proj
+Quantizing ...
+31 self_attn.k_proj
+Quantizing ...
+31 self_attn.v_proj
+Quantizing ...
+31 self_attn.q_proj
+Quantizing ...
+31 self_attn.o_proj
+Quantizing ...
+31 mlp.up_proj
+Quantizing ...
+31 mlp.gate_proj
+Quantizing ...
+31 mlp.down_proj
+Quantizing ...
+32 self_attn.k_proj
+Quantizing ...
+32 self_attn.v_proj
+Quantizing ...
+32 self_attn.q_proj
+Quantizing ...
+32 self_attn.o_proj
+Quantizing ...
+32 mlp.up_proj
+Quantizing ...
+32 mlp.gate_proj
+Quantizing ...
+32 mlp.down_proj
+Quantizing ...
+33 self_attn.k_proj
+Quantizing ...
+33 self_attn.v_proj
+Quantizing ...
+33 self_attn.q_proj
+Quantizing ...
+33 self_attn.o_proj
+Quantizing ...
+33 mlp.up_proj
+Quantizing ...
+33 mlp.gate_proj
+Quantizing ...
+33 mlp.down_proj
+Quantizing ...
+34 self_attn.k_proj
+Quantizing ...
+34 self_attn.v_proj
+Quantizing ...
+34 self_attn.q_proj
+Quantizing ...
+34 self_attn.o_proj
+Quantizing ...
+34 mlp.up_proj
+Quantizing ...
+34 mlp.gate_proj
+Quantizing ...
+34 mlp.down_proj
+Quantizing ...
+35 self_attn.k_proj
+Quantizing ...
+35 self_attn.v_proj
+Quantizing ...
+35 self_attn.q_proj
+Quantizing ...
+35 self_attn.o_proj
+Quantizing ...
+35 mlp.up_proj
+Quantizing ...
+35 mlp.gate_proj
+Quantizing ...
+35 mlp.down_proj
+Quantizing ...
+36 self_attn.k_proj
+Quantizing ...
+36 self_attn.v_proj
+Quantizing ...
+36 self_attn.q_proj
+Quantizing ...
+36 self_attn.o_proj
+Quantizing ...
+36 mlp.up_proj
+Quantizing ...
+36 mlp.gate_proj
+Quantizing ...
+36 mlp.down_proj
+Quantizing ...
+37 self_attn.k_proj
+Quantizing ...
+37 self_attn.v_proj
+Quantizing ...
+37 self_attn.q_proj
+Quantizing ...
+37 self_attn.o_proj
+Quantizing ...
+37 mlp.up_proj
+Quantizing ...
+37 mlp.gate_proj
+Quantizing ...
+37 mlp.down_proj
+Quantizing ...
+38 self_attn.k_proj
+Quantizing ...
+38 self_attn.v_proj
+Quantizing ...
+38 self_attn.q_proj
+Quantizing ...
+38 self_attn.o_proj
+Quantizing ...
+38 mlp.up_proj
+Quantizing ...
+38 mlp.gate_proj
+Quantizing ...
+38 mlp.down_proj
+Quantizing ...
+39 self_attn.k_proj
+Quantizing ...
+39 self_attn.v_proj
+Quantizing ...
+39 self_attn.q_proj
+Quantizing ...
+39 self_attn.o_proj
+Quantizing ...
+39 mlp.up_proj
+Quantizing ...
+39 mlp.gate_proj
+Quantizing ...
+39 mlp.down_proj
+Quantizing ...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt
index cfb6519..02be0af 100644
--- a/delta_2bits_sparse_099.txt
+++ b/delta_2bits_sparse_099.txt
@@ -6,3 +6,806 @@ Quantizing ...
 Quantizing ...
 0 self_attn.q_proj
 Quantizing ...
+0 self_attn.o_proj
+Quantizing ...
+0 mlp.up_proj
+Quantizing ...
+0 mlp.gate_proj
+Quantizing ...
+0 mlp.down_proj
+Quantizing ...
+1 self_attn.k_proj
+Quantizing ...
+1 self_attn.v_proj
+Quantizing ...
+1 self_attn.q_proj
+Quantizing ...
+1 self_attn.o_proj
+Quantizing ...
+1 mlp.up_proj
+Quantizing ...
+1 mlp.gate_proj
+Quantizing ...
+1 mlp.down_proj
+Quantizing ...
+2 self_attn.k_proj
+Quantizing ...
+2 self_attn.v_proj
+Quantizing ...
+2 self_attn.q_proj
+Quantizing ...
+2 self_attn.o_proj
+Quantizing ...
+2 mlp.up_proj
+Quantizing ...
+2 mlp.gate_proj
+Quantizing ...
+2 mlp.down_proj
+Quantizing ...
+3 self_attn.k_proj
+Quantizing ...
+3 self_attn.v_proj
+Quantizing ...
+3 self_attn.q_proj
+Quantizing ...
+3 self_attn.o_proj
+Quantizing ...
+3 mlp.up_proj
+Quantizing ...
+3 mlp.gate_proj
+Quantizing ...
+3 mlp.down_proj
+Quantizing ...
+4 self_attn.k_proj
+Quantizing ...
+4 self_attn.v_proj
+Quantizing ...
+4 self_attn.q_proj
+Quantizing ...
+4 self_attn.o_proj
+Quantizing ...
+4 mlp.up_proj
+Quantizing ...
+4 mlp.gate_proj
+Quantizing ...
+4 mlp.down_proj
+Quantizing ...
+5 self_attn.k_proj
+Quantizing ...
+5 self_attn.v_proj
+Quantizing ...
+5 self_attn.q_proj
+Quantizing ...
+5 self_attn.o_proj
+Quantizing ...
+5 mlp.up_proj
+Quantizing ...
+5 mlp.gate_proj
+Quantizing ...
+5 mlp.down_proj
+Quantizing ...
+6 self_attn.k_proj
+Quantizing ...
+6 self_attn.v_proj
+Quantizing ...
+6 self_attn.q_proj
+Quantizing ...
+6 self_attn.o_proj
+Quantizing ...
+6 mlp.up_proj
+Quantizing ...
+6 mlp.gate_proj
+Quantizing ...
+6 mlp.down_proj
+Quantizing ...
+7 self_attn.k_proj
+Quantizing ...
+7 self_attn.v_proj
+Quantizing ...
+7 self_attn.q_proj
+Quantizing ...
+7 self_attn.o_proj
+Quantizing ...
+7 mlp.up_proj
+Quantizing ...
+7 mlp.gate_proj
+Quantizing ...
+7 mlp.down_proj
+Quantizing ...
+8 self_attn.k_proj
+Quantizing ...
+8 self_attn.v_proj
+Quantizing ...
+8 self_attn.q_proj
+Quantizing ...
+8 self_attn.o_proj
+Quantizing ...
+8 mlp.up_proj
+Quantizing ...
+8 mlp.gate_proj
+Quantizing ...
+8 mlp.down_proj
+Quantizing ...
+9 self_attn.k_proj
+Quantizing ...
+9 self_attn.v_proj
+Quantizing ...
+9 self_attn.q_proj
+Quantizing ...
+9 self_attn.o_proj
+Quantizing ...
+9 mlp.up_proj
+Quantizing ...
+9 mlp.gate_proj
+Quantizing ...
+9 mlp.down_proj
+Quantizing ...
+10 self_attn.k_proj
+Quantizing ...
+10 self_attn.v_proj
+Quantizing ...
+10 self_attn.q_proj
+Quantizing ...
+10 self_attn.o_proj
+Quantizing ...
+10 mlp.up_proj
+Quantizing ...
+10 mlp.gate_proj
+Quantizing ...
+10 mlp.down_proj
+Quantizing ...
+11 self_attn.k_proj
+Quantizing ...
+11 self_attn.v_proj
+Quantizing ...
+11 self_attn.q_proj
+Quantizing ...
+11 self_attn.o_proj
+Quantizing ...
+11 mlp.up_proj
+Quantizing ...
+11 mlp.gate_proj
+Quantizing ...
+11 mlp.down_proj
+Quantizing ...
+12 self_attn.k_proj
+Quantizing ...
+12 self_attn.v_proj
+Quantizing ...
+12 self_attn.q_proj
+Quantizing ...
+12 self_attn.o_proj
+Quantizing ...
+12 mlp.up_proj
+Quantizing ...
+12 mlp.gate_proj
+Quantizing ...
+12 mlp.down_proj
+Quantizing ...
+13 self_attn.k_proj
+Quantizing ...
+13 self_attn.v_proj
+Quantizing ...
+13 self_attn.q_proj
+Quantizing ...
+13 self_attn.o_proj
+Quantizing ...
+13 mlp.up_proj
+Quantizing ...
+13 mlp.gate_proj
+Quantizing ...
+13 mlp.down_proj
+Quantizing ...
+14 self_attn.k_proj
+Quantizing ...
+14 self_attn.v_proj
+Quantizing ...
+14 self_attn.q_proj
+Quantizing ...
+14 self_attn.o_proj
+Quantizing ...
+14 mlp.up_proj
+Quantizing ...
+14 mlp.gate_proj
+Quantizing ...
+14 mlp.down_proj
+Quantizing ...
+15 self_attn.k_proj
+Quantizing ...
+15 self_attn.v_proj
+Quantizing ...
+15 self_attn.q_proj
+Quantizing ...
+15 self_attn.o_proj
+Quantizing ...
+15 mlp.up_proj
+Quantizing ...
+15 mlp.gate_proj
+Quantizing ...
+15 mlp.down_proj
+Quantizing ...
+16 self_attn.k_proj
+Quantizing ...
+16 self_attn.v_proj
+Quantizing ...
+16 self_attn.q_proj
+Quantizing ...
+16 self_attn.o_proj
+Quantizing ...
+16 mlp.up_proj
+Quantizing ...
+16 mlp.gate_proj
+Quantizing ...
+16 mlp.down_proj
+Quantizing ...
+17 self_attn.k_proj
+Quantizing ...
+17 self_attn.v_proj
+Quantizing ...
+17 self_attn.q_proj
+Quantizing ...
+17 self_attn.o_proj
+Quantizing ...
+17 mlp.up_proj
+Quantizing ...
+17 mlp.gate_proj
+Quantizing ...
+17 mlp.down_proj
+Quantizing ...
+18 self_attn.k_proj
+Quantizing ...
+18 self_attn.v_proj
+Quantizing ...
+18 self_attn.q_proj
+Quantizing ...
+18 self_attn.o_proj
+Quantizing ...
+18 mlp.up_proj
+Quantizing ...
+18 mlp.gate_proj
+Quantizing ...
+18 mlp.down_proj
+Quantizing ...
+19 self_attn.k_proj
+Quantizing ...
+19 self_attn.v_proj
+Quantizing ...
+19 self_attn.q_proj
+Quantizing ...
+19 self_attn.o_proj
+Quantizing ...
+19 mlp.up_proj
+Quantizing ...
+19 mlp.gate_proj
+Quantizing ...
+19 mlp.down_proj
+Quantizing ...
+20 self_attn.k_proj
+Quantizing ...
+20 self_attn.v_proj
+Quantizing ...
+20 self_attn.q_proj
+Quantizing ...
+20 self_attn.o_proj
+Quantizing ...
+20 mlp.up_proj
+Quantizing ...
+20 mlp.gate_proj
+Quantizing ...
+20 mlp.down_proj
+Quantizing ...
+21 self_attn.k_proj
+Quantizing ...
+21 self_attn.v_proj
+Quantizing ...
+21 self_attn.q_proj
+Quantizing ...
+21 self_attn.o_proj
+Quantizing ...
+21 mlp.up_proj
+Quantizing ...
+21 mlp.gate_proj
+Quantizing ...
+21 mlp.down_proj
+Quantizing ...
+22 self_attn.k_proj
+Quantizing ...
+22 self_attn.v_proj
+Quantizing ...
+22 self_attn.q_proj
+Quantizing ...
+22 self_attn.o_proj
+Quantizing ...
+22 mlp.up_proj
+Quantizing ...
+22 mlp.gate_proj
+Quantizing ...
+22 mlp.down_proj
+Quantizing ...
+23 self_attn.k_proj
+Quantizing ...
+23 self_attn.v_proj
+Quantizing ...
+23 self_attn.q_proj
+Quantizing ...
+23 self_attn.o_proj
+Quantizing ...
+23 mlp.up_proj
+Quantizing ...
+23 mlp.gate_proj
+Quantizing ...
+23 mlp.down_proj
+Quantizing ...
+24 self_attn.k_proj
+Quantizing ...
+24 self_attn.v_proj
+Quantizing ...
+24 self_attn.q_proj
+Quantizing ...
+24 self_attn.o_proj
+Quantizing ...
+24 mlp.up_proj
+Quantizing ...
+24 mlp.gate_proj
+Quantizing ...
+24 mlp.down_proj
+Quantizing ...
+25 self_attn.k_proj
+Quantizing ...
+25 self_attn.v_proj
+Quantizing ...
+25 self_attn.q_proj
+Quantizing ...
+25 self_attn.o_proj
+Quantizing ...
+25 mlp.up_proj
+Quantizing ...
+25 mlp.gate_proj
+Quantizing ...
+25 mlp.down_proj
+Quantizing ...
+26 self_attn.k_proj
+Quantizing ...
+26 self_attn.v_proj
+Quantizing ...
+26 self_attn.q_proj
+Quantizing ...
+26 self_attn.o_proj
+Quantizing ...
+26 mlp.up_proj
+Quantizing ...
+26 mlp.gate_proj
+Quantizing ...
+26 mlp.down_proj
+Quantizing ...
+27 self_attn.k_proj
+Quantizing ...
+27 self_attn.v_proj
+Quantizing ...
+27 self_attn.q_proj
+Quantizing ...
+27 self_attn.o_proj
+Quantizing ...
+27 mlp.up_proj
+Quantizing ...
+27 mlp.gate_proj
+Quantizing ...
+27 mlp.down_proj
+Quantizing ...
+28 self_attn.k_proj
+Quantizing ...
+28 self_attn.v_proj
+Quantizing ...
+28 self_attn.q_proj
+Quantizing ...
+28 self_attn.o_proj
+Quantizing ...
+28 mlp.up_proj
+Quantizing ...
+28 mlp.gate_proj
+Quantizing ...
+28 mlp.down_proj
+Quantizing ...
+29 self_attn.k_proj
+Quantizing ...
+29 self_attn.v_proj
+Quantizing ...
+29 self_attn.q_proj
+Quantizing ...
+29 self_attn.o_proj
+Quantizing ...
+29 mlp.up_proj
+Quantizing ...
+29 mlp.gate_proj
+Quantizing ...
+29 mlp.down_proj
+Quantizing ...
+30 self_attn.k_proj
+Quantizing ...
+30 self_attn.v_proj
+Quantizing ...
+30 self_attn.q_proj
+Quantizing ...
+30 self_attn.o_proj
+Quantizing ...
+30 mlp.up_proj
+Quantizing ...
+30 mlp.gate_proj
+Quantizing ...
+30 mlp.down_proj
+Quantizing ...
+31 self_attn.k_proj
+Quantizing ...
+31 self_attn.v_proj
+Quantizing ...
+31 self_attn.q_proj
+Quantizing ...
+31 self_attn.o_proj
+Quantizing ...
+31 mlp.up_proj
+Quantizing ...
+31 mlp.gate_proj
+Quantizing ...
+31 mlp.down_proj
+Quantizing ...
+32 self_attn.k_proj
+Quantizing ...
+32 self_attn.v_proj
+Quantizing ...
+32 self_attn.q_proj
+Quantizing ...
+32 self_attn.o_proj
+Quantizing ...
+32 mlp.up_proj
+Quantizing ...
+32 mlp.gate_proj
+Quantizing ...
+32 mlp.down_proj
+Quantizing ...
+33 self_attn.k_proj
+Quantizing ...
+33 self_attn.v_proj
+Quantizing ...
+33 self_attn.q_proj
+Quantizing ...
+33 self_attn.o_proj
+Quantizing ...
+33 mlp.up_proj
+Quantizing ...
+33 mlp.gate_proj
+Quantizing ...
+33 mlp.down_proj
+Quantizing ...
+34 self_attn.k_proj
+Quantizing ...
+34 self_attn.v_proj
+Quantizing ...
+34 self_attn.q_proj
+Quantizing ...
+34 self_attn.o_proj
+Quantizing ...
+34 mlp.up_proj
+Quantizing ...
+34 mlp.gate_proj
+Quantizing ...
+34 mlp.down_proj
+Quantizing ...
+35 self_attn.k_proj
+Quantizing ...
+35 self_attn.v_proj
+Quantizing ...
+35 self_attn.q_proj
+Quantizing ...
+35 self_attn.o_proj
+Quantizing ...
+35 mlp.up_proj
+Quantizing ...
+35 mlp.gate_proj
+Quantizing ...
+35 mlp.down_proj
+Quantizing ...
+36 self_attn.k_proj
+Quantizing ...
+36 self_attn.v_proj
+Quantizing ...
+36 self_attn.q_proj
+Quantizing ...
+36 self_attn.o_proj
+Quantizing ...
+36 mlp.up_proj
+Quantizing ...
+36 mlp.gate_proj
+Quantizing ...
+36 mlp.down_proj
+Quantizing ...
+37 self_attn.k_proj
+Quantizing ...
+37 self_attn.v_proj
+Quantizing ...
+37 self_attn.q_proj
+Quantizing ...
+37 self_attn.o_proj
+Quantizing ...
+37 mlp.up_proj
+Quantizing ...
+37 mlp.gate_proj
+Quantizing ...
+37 mlp.down_proj
+Quantizing ...
+38 self_attn.k_proj
+Quantizing ...
+38 self_attn.v_proj
+Quantizing ...
+38 self_attn.q_proj
+Quantizing ...
+38 self_attn.o_proj
+Quantizing ...
+38 mlp.up_proj
+Quantizing ...
+38 mlp.gate_proj
+Quantizing ...
+38 mlp.down_proj
+Quantizing ...
+39 self_attn.k_proj
+Quantizing ...
+39 self_attn.v_proj
+Quantizing ...
+39 self_attn.q_proj
+Quantizing ...
+39 self_attn.o_proj
+Quantizing ...
+39 mlp.up_proj
+Quantizing ...
+39 mlp.gate_proj
+Quantizing ...
+39 mlp.down_proj
+Quantizing ...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
diff --git a/delta_4bits.txt b/delta_4bits.txt
index 39698f4..4cecc92 100644
--- a/delta_4bits.txt
+++ b/delta_4bits.txt
@@ -1,30 +1 @@
 Starting ...
-Ready.
-0 self_attn.k_proj
-Quantizing ...
-0 self_attn.v_proj
-Quantizing ...
-0 self_attn.q_proj
-Quantizing ...
-0 self_attn.o_proj
-Quantizing ...
-0 mlp.up_proj
-Quantizing ...
-0 mlp.gate_proj
-Quantizing ...
-0 mlp.down_proj
-Quantizing ...
-1 self_attn.k_proj
-Quantizing ...
-1 self_attn.v_proj
-Quantizing ...
-1 self_attn.q_proj
-Quantizing ...
-1 self_attn.o_proj
-Quantizing ...
-1 mlp.up_proj
-Quantizing ...
-1 mlp.gate_proj
-Quantizing ...
-1 mlp.down_proj
-Quantizing ...
diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt
index cfb6519..e173fbb 100644
--- a/delta_4bits_sparse_09.txt
+++ b/delta_4bits_sparse_09.txt
@@ -6,3 +6,807 @@ Quantizing ...
 Quantizing ...
 0 self_attn.q_proj
 Quantizing ...
+0 self_attn.o_proj
+Quantizing ...
+0 mlp.up_proj
+Quantizing ...
+0 mlp.gate_proj
+Quantizing ...
+0 mlp.down_proj
+Quantizing ...
+1 self_attn.k_proj
+Quantizing ...
+1 self_attn.v_proj
+Quantizing ...
+1 self_attn.q_proj
+Quantizing ...
+1 self_attn.o_proj
+Quantizing ...
+1 mlp.up_proj
+Quantizing ...
+1 mlp.gate_proj
+Quantizing ...
+1 mlp.down_proj
+Quantizing ...
+2 self_attn.k_proj
+Quantizing ...
+2 self_attn.v_proj
+Quantizing ...
+2 self_attn.q_proj
+Quantizing ...
+2 self_attn.o_proj
+Quantizing ...
+2 mlp.up_proj
+Quantizing ...
+2 mlp.gate_proj
+Quantizing ...
+2 mlp.down_proj
+Quantizing ...
+3 self_attn.k_proj
+Quantizing ...
+3 self_attn.v_proj
+Quantizing ...
+3 self_attn.q_proj
+Quantizing ...
+3 self_attn.o_proj
+Quantizing ...
+3 mlp.up_proj
+Quantizing ...
+3 mlp.gate_proj
+Quantizing ...
+3 mlp.down_proj
+Quantizing ...
+4 self_attn.k_proj
+Quantizing ...
+4 self_attn.v_proj
+Quantizing ...
+4 self_attn.q_proj
+Quantizing ...
+4 self_attn.o_proj
+Quantizing ...
+4 mlp.up_proj
+Quantizing ...
+4 mlp.gate_proj
+Quantizing ...
+4 mlp.down_proj
+Quantizing ...
+5 self_attn.k_proj
+Quantizing ...
+5 self_attn.v_proj
+Quantizing ...
+5 self_attn.q_proj
+Quantizing ...
+5 self_attn.o_proj
+Quantizing ...
+5 mlp.up_proj
+Quantizing ...
+5 mlp.gate_proj
+Quantizing ...
+5 mlp.down_proj
+Quantizing ...
+6 self_attn.k_proj
+Quantizing ...
+6 self_attn.v_proj
+Quantizing ...
+6 self_attn.q_proj
+Quantizing ...
+6 self_attn.o_proj
+Quantizing ...
+6 mlp.up_proj
+Quantizing ...
+6 mlp.gate_proj
+Quantizing ...
+6 mlp.down_proj
+Quantizing ...
+7 self_attn.k_proj
+Quantizing ...
+7 self_attn.v_proj
+Quantizing ...
+7 self_attn.q_proj
+Quantizing ...
+7 self_attn.o_proj
+Quantizing ...
+7 mlp.up_proj
+Quantizing ...
+7 mlp.gate_proj
+Quantizing ...
+7 mlp.down_proj
+Quantizing ...
+8 self_attn.k_proj
+Quantizing ...
+8 self_attn.v_proj
+Quantizing ...
+8 self_attn.q_proj
+Quantizing ...
+8 self_attn.o_proj
+Quantizing ...
+8 mlp.up_proj
+Quantizing ...
+8 mlp.gate_proj
+Quantizing ...
+8 mlp.down_proj
+Quantizing ...
+9 self_attn.k_proj
+Quantizing ...
+9 self_attn.v_proj
+Quantizing ...
+9 self_attn.q_proj
+Quantizing ...
+9 self_attn.o_proj
+Quantizing ...
+9 mlp.up_proj
+Quantizing ...
+9 mlp.gate_proj
+Quantizing ...
+9 mlp.down_proj
+Quantizing ...
+10 self_attn.k_proj
+Quantizing ...
+10 self_attn.v_proj
+Quantizing ...
+10 self_attn.q_proj
+Quantizing ...
+10 self_attn.o_proj
+Quantizing ...
+10 mlp.up_proj
+Quantizing ...
+10 mlp.gate_proj
+Quantizing ...
+10 mlp.down_proj
+Quantizing ...
+11 self_attn.k_proj
+Quantizing ...
+11 self_attn.v_proj
+Quantizing ...
+11 self_attn.q_proj
+Quantizing ...
+11 self_attn.o_proj
+Quantizing ...
+11 mlp.up_proj
+Quantizing ...
+11 mlp.gate_proj
+Quantizing ...
+11 mlp.down_proj
+Quantizing ...
+12 self_attn.k_proj
+Quantizing ...
+12 self_attn.v_proj
+Quantizing ...
+12 self_attn.q_proj
+Quantizing ...
+12 self_attn.o_proj
+Quantizing ...
+12 mlp.up_proj
+Quantizing ...
+12 mlp.gate_proj
+Quantizing ...
+12 mlp.down_proj
+Quantizing ...
+13 self_attn.k_proj
+Quantizing ...
+13 self_attn.v_proj
+Quantizing ...
+13 self_attn.q_proj
+Quantizing ...
+13 self_attn.o_proj
+Quantizing ...
+13 mlp.up_proj
+Quantizing ...
+13 mlp.gate_proj
+Quantizing ...
+13 mlp.down_proj
+Quantizing ...
+14 self_attn.k_proj
+Quantizing ...
+14 self_attn.v_proj
+Quantizing ...
+14 self_attn.q_proj
+Quantizing ...
+14 self_attn.o_proj
+Quantizing ...
+14 mlp.up_proj
+Quantizing ...
+14 mlp.gate_proj
+Quantizing ...
+14 mlp.down_proj
+Quantizing ...
+15 self_attn.k_proj
+Quantizing ...
+15 self_attn.v_proj
+Quantizing ...
+15 self_attn.q_proj
+Quantizing ...
+15 self_attn.o_proj
+Quantizing ...
+15 mlp.up_proj
+Quantizing ...
+15 mlp.gate_proj
+Quantizing ...
+15 mlp.down_proj
+Quantizing ...
+16 self_attn.k_proj
+Quantizing ...
+16 self_attn.v_proj
+Quantizing ...
+16 self_attn.q_proj
+Quantizing ...
+16 self_attn.o_proj
+Quantizing ...
+16 mlp.up_proj
+Quantizing ...
+16 mlp.gate_proj
+Quantizing ...
+16 mlp.down_proj
+Quantizing ...
+17 self_attn.k_proj
+Quantizing ...
+17 self_attn.v_proj
+Quantizing ...
+17 self_attn.q_proj
+Quantizing ...
+17 self_attn.o_proj
+Quantizing ...
+17 mlp.up_proj
+Quantizing ...
+17 mlp.gate_proj
+Quantizing ...
+17 mlp.down_proj
+Quantizing ...
+18 self_attn.k_proj
+Quantizing ...
+18 self_attn.v_proj
+Quantizing ...
+18 self_attn.q_proj
+Quantizing ...
+18 self_attn.o_proj
+Quantizing ...
+18 mlp.up_proj
+Quantizing ...
+18 mlp.gate_proj
+Quantizing ...
+18 mlp.down_proj
+Quantizing ...
+19 self_attn.k_proj
+Quantizing ...
+19 self_attn.v_proj
+Quantizing ...
+19 self_attn.q_proj
+Quantizing ...
+19 self_attn.o_proj
+Quantizing ...
+19 mlp.up_proj
+Quantizing ...
+19 mlp.gate_proj
+Quantizing ...
+19 mlp.down_proj
+Quantizing ...
+20 self_attn.k_proj
+Quantizing ...
+20 self_attn.v_proj
+Quantizing ...
+20 self_attn.q_proj
+Quantizing ...
+20 self_attn.o_proj
+Quantizing ...
+20 mlp.up_proj
+Quantizing ...
+20 mlp.gate_proj
+Quantizing ...
+20 mlp.down_proj
+Quantizing ...
+21 self_attn.k_proj
+Quantizing ...
+21 self_attn.v_proj
+Quantizing ...
+21 self_attn.q_proj
+Quantizing ...
+21 self_attn.o_proj
+Quantizing ...
+21 mlp.up_proj
+Quantizing ...
+21 mlp.gate_proj
+Quantizing ...
+21 mlp.down_proj
+Quantizing ...
+22 self_attn.k_proj
+Quantizing ...
+22 self_attn.v_proj
+Quantizing ...
+22 self_attn.q_proj
+Quantizing ...
+22 self_attn.o_proj
+Quantizing ...
+22 mlp.up_proj
+Quantizing ...
+22 mlp.gate_proj
+Quantizing ...
+22 mlp.down_proj
+Quantizing ...
+23 self_attn.k_proj
+Quantizing ...
+23 self_attn.v_proj
+Quantizing ...
+23 self_attn.q_proj
+Quantizing ...
+23 self_attn.o_proj
+Quantizing ...
+23 mlp.up_proj
+Quantizing ...
+23 mlp.gate_proj
+Quantizing ...
+23 mlp.down_proj
+Quantizing ...
+24 self_attn.k_proj
+Quantizing ...
+24 self_attn.v_proj
+Quantizing ...
+24 self_attn.q_proj
+Quantizing ...
+24 self_attn.o_proj
+Quantizing ...
+24 mlp.up_proj
+Quantizing ...
+24 mlp.gate_proj
+Quantizing ...
+24 mlp.down_proj
+Quantizing ...
+25 self_attn.k_proj
+Quantizing ...
+25 self_attn.v_proj
+Quantizing ...
+25 self_attn.q_proj
+Quantizing ...
+25 self_attn.o_proj
+Quantizing ...
+25 mlp.up_proj
+Quantizing ...
+25 mlp.gate_proj
+Quantizing ...
+25 mlp.down_proj
+Quantizing ...
+26 self_attn.k_proj
+Quantizing ...
+26 self_attn.v_proj
+Quantizing ...
+26 self_attn.q_proj
+Quantizing ...
+26 self_attn.o_proj
+Quantizing ...
+26 mlp.up_proj
+Quantizing ...
+26 mlp.gate_proj
+Quantizing ...
+26 mlp.down_proj
+Quantizing ...
+27 self_attn.k_proj
+Quantizing ...
+27 self_attn.v_proj
+Quantizing ...
+27 self_attn.q_proj
+Quantizing ...
+27 self_attn.o_proj
+Quantizing ...
+27 mlp.up_proj
+Quantizing ...
+27 mlp.gate_proj
+Quantizing ...
+27 mlp.down_proj
+Quantizing ...
+28 self_attn.k_proj
+Quantizing ...
+28 self_attn.v_proj
+Quantizing ...
+28 self_attn.q_proj
+Quantizing ...
+28 self_attn.o_proj
+Quantizing ...
+28 mlp.up_proj
+Quantizing ...
+28 mlp.gate_proj
+Quantizing ...
+28 mlp.down_proj
+Quantizing ...
+29 self_attn.k_proj
+Quantizing ...
+29 self_attn.v_proj
+Quantizing ...
+29 self_attn.q_proj
+Quantizing ...
+29 self_attn.o_proj
+Quantizing ...
+29 mlp.up_proj
+Quantizing ...
+29 mlp.gate_proj
+Quantizing ...
+29 mlp.down_proj
+Quantizing ...
+30 self_attn.k_proj
+Quantizing ...
+30 self_attn.v_proj
+Quantizing ...
+30 self_attn.q_proj
+Quantizing ...
+30 self_attn.o_proj
+Quantizing ...
+30 mlp.up_proj
+Quantizing ...
+30 mlp.gate_proj
+Quantizing ...
+30 mlp.down_proj
+Quantizing ...
+31 self_attn.k_proj
+Quantizing ...
+31 self_attn.v_proj
+Quantizing ...
+31 self_attn.q_proj
+Quantizing ...
+31 self_attn.o_proj
+Quantizing ...
+31 mlp.up_proj
+Quantizing ...
+31 mlp.gate_proj
+Quantizing ...
+31 mlp.down_proj
+Quantizing ...
+32 self_attn.k_proj
+Quantizing ...
+32 self_attn.v_proj
+Quantizing ...
+32 self_attn.q_proj
+Quantizing ...
+32 self_attn.o_proj
+Quantizing ...
+32 mlp.up_proj
+Quantizing ...
+32 mlp.gate_proj
+Quantizing ...
+32 mlp.down_proj
+Quantizing ...
+33 self_attn.k_proj
+Quantizing ...
+33 self_attn.v_proj
+Quantizing ...
+33 self_attn.q_proj
+Quantizing ...
+33 self_attn.o_proj
+Quantizing ...
+33 mlp.up_proj
+Quantizing ...
+33 mlp.gate_proj
+Quantizing ...
+33 mlp.down_proj
+Quantizing ...
+34 self_attn.k_proj
+Quantizing ...
+34 self_attn.v_proj
+Quantizing ...
+34 self_attn.q_proj
+Quantizing ...
+34 self_attn.o_proj
+Quantizing ...
+34 mlp.up_proj
+Quantizing ...
+34 mlp.gate_proj
+Quantizing ...
+34 mlp.down_proj
+Quantizing ...
+35 self_attn.k_proj
+Quantizing ...
+35 self_attn.v_proj
+Quantizing ...
+35 self_attn.q_proj
+Quantizing ...
+35 self_attn.o_proj
+Quantizing ...
+35 mlp.up_proj
+Quantizing ...
+35 mlp.gate_proj
+Quantizing ...
+35 mlp.down_proj
+Quantizing ...
+36 self_attn.k_proj
+Quantizing ...
+36 self_attn.v_proj
+Quantizing ...
+36 self_attn.q_proj
+Quantizing ...
+36 self_attn.o_proj
+Quantizing ...
+36 mlp.up_proj
+Quantizing ...
+36 mlp.gate_proj
+Quantizing ...
+36 mlp.down_proj
+Quantizing ...
+37 self_attn.k_proj
+Quantizing ...
+37 self_attn.v_proj
+Quantizing ...
+37 self_attn.q_proj
+Quantizing ...
+37 self_attn.o_proj
+Quantizing ...
+37 mlp.up_proj
+Quantizing ...
+37 mlp.gate_proj
+Quantizing ...
+37 mlp.down_proj
+Quantizing ...
+38 self_attn.k_proj
+Quantizing ...
+38 self_attn.v_proj
+Quantizing ...
+38 self_attn.q_proj
+Quantizing ...
+38 self_attn.o_proj
+Quantizing ...
+38 mlp.up_proj
+Quantizing ...
+38 mlp.gate_proj
+Quantizing ...
+38 mlp.down_proj
+Quantizing ...
+39 self_attn.k_proj
+Quantizing ...
+39 self_attn.v_proj
+Quantizing ...
+39 self_attn.q_proj
+Quantizing ...
+39 self_attn.o_proj
+Quantizing ...
+39 mlp.up_proj
+Quantizing ...
+39 mlp.gate_proj
+Quantizing ...
+39 mlp.down_proj
+Quantizing ...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt
new file mode 100644
index 0000000..80e3ba1
--- /dev/null
+++ b/delta_4bits_sparse_099.txt
@@ -0,0 +1,808 @@
+Starting ...
+Ready.
+0 self_attn.k_proj
+Quantizing ...
+0 self_attn.v_proj
+Quantizing ...
+0 self_attn.q_proj
+Quantizing ...
+0 self_attn.o_proj
+Quantizing ...
+0 mlp.up_proj
+Quantizing ...
+0 mlp.gate_proj
+Quantizing ...
+0 mlp.down_proj
+Quantizing ...
+1 self_attn.k_proj
+Quantizing ...
+1 self_attn.v_proj
+Quantizing ...
+1 self_attn.q_proj
+Quantizing ...
+1 self_attn.o_proj
+Quantizing ...
+1 mlp.up_proj
+Quantizing ...
+1 mlp.gate_proj
+Quantizing ...
+1 mlp.down_proj
+Quantizing ...
+2 self_attn.k_proj
+Quantizing ...
+2 self_attn.v_proj
+Quantizing ...
+2 self_attn.q_proj
+Quantizing ...
+2 self_attn.o_proj
+Quantizing ...
+2 mlp.up_proj
+Quantizing ...
+2 mlp.gate_proj
+Quantizing ...
+2 mlp.down_proj
+Quantizing ...
+3 self_attn.k_proj
+Quantizing ...
+3 self_attn.v_proj
+Quantizing ...
+3 self_attn.q_proj
+Quantizing ...
+3 self_attn.o_proj
+Quantizing ...
+3 mlp.up_proj
+Quantizing ...
+3 mlp.gate_proj
+Quantizing ...
+3 mlp.down_proj
+Quantizing ...
+4 self_attn.k_proj
+Quantizing ...
+4 self_attn.v_proj
+Quantizing ...
+4 self_attn.q_proj
+Quantizing ...
+4 self_attn.o_proj
+Quantizing ...
+4 mlp.up_proj
+Quantizing ...
+4 mlp.gate_proj
+Quantizing ...
+4 mlp.down_proj
+Quantizing ...
+5 self_attn.k_proj
+Quantizing ...
+5 self_attn.v_proj
+Quantizing ...
+5 self_attn.q_proj
+Quantizing ...
+5 self_attn.o_proj
+Quantizing ...
+5 mlp.up_proj
+Quantizing ...
+5 mlp.gate_proj
+Quantizing ...
+5 mlp.down_proj
+Quantizing ...
+6 self_attn.k_proj
+Quantizing ...
+6 self_attn.v_proj
+Quantizing ...
+6 self_attn.q_proj
+Quantizing ...
+6 self_attn.o_proj
+Quantizing ...
+6 mlp.up_proj
+Quantizing ...
+6 mlp.gate_proj
+Quantizing ...
+6 mlp.down_proj
+Quantizing ...
+7 self_attn.k_proj
+Quantizing ...
+7 self_attn.v_proj
+Quantizing ...
+7 self_attn.q_proj
+Quantizing ...
+7 self_attn.o_proj
+Quantizing ...
+7 mlp.up_proj
+Quantizing ...
+7 mlp.gate_proj
+Quantizing ...
+7 mlp.down_proj
+Quantizing ...
+8 self_attn.k_proj
+Quantizing ...
+8 self_attn.v_proj
+Quantizing ...
+8 self_attn.q_proj
+Quantizing ...
+8 self_attn.o_proj
+Quantizing ...
+8 mlp.up_proj
+Quantizing ...
+8 mlp.gate_proj
+Quantizing ...
+8 mlp.down_proj
+Quantizing ...
+9 self_attn.k_proj
+Quantizing ...
+9 self_attn.v_proj
+Quantizing ...
+9 self_attn.q_proj
+Quantizing ...
+9 self_attn.o_proj
+Quantizing ...
+9 mlp.up_proj
+Quantizing ...
+9 mlp.gate_proj
+Quantizing ...
+9 mlp.down_proj
+Quantizing ...
+10 self_attn.k_proj
+Quantizing ...
+10 self_attn.v_proj
+Quantizing ...
+10 self_attn.q_proj
+Quantizing ...
+10 self_attn.o_proj
+Quantizing ...
+10 mlp.up_proj
+Quantizing ...
+10 mlp.gate_proj
+Quantizing ...
+10 mlp.down_proj
+Quantizing ...
+11 self_attn.k_proj
+Quantizing ...
+11 self_attn.v_proj
+Quantizing ...
+11 self_attn.q_proj
+Quantizing ...
+11 self_attn.o_proj
+Quantizing ...
+11 mlp.up_proj
+Quantizing ...
+11 mlp.gate_proj
+Quantizing ...
+11 mlp.down_proj
+Quantizing ...
+12 self_attn.k_proj
+Quantizing ...
+12 self_attn.v_proj
+Quantizing ...
+12 self_attn.q_proj
+Quantizing ...
+12 self_attn.o_proj
+Quantizing ...
+12 mlp.up_proj
+Quantizing ...
+12 mlp.gate_proj
+Quantizing ...
+12 mlp.down_proj
+Quantizing ...
+13 self_attn.k_proj
+Quantizing ...
+13 self_attn.v_proj
+Quantizing ...
+13 self_attn.q_proj
+Quantizing ...
+13 self_attn.o_proj
+Quantizing ...
+13 mlp.up_proj
+Quantizing ...
+13 mlp.gate_proj
+Quantizing ...
+13 mlp.down_proj
+Quantizing ...
+14 self_attn.k_proj
+Quantizing ...
+14 self_attn.v_proj
+Quantizing ...
+14 self_attn.q_proj
+Quantizing ...
+14 self_attn.o_proj
+Quantizing ...
+14 mlp.up_proj
+Quantizing ...
+14 mlp.gate_proj
+Quantizing ...
+14 mlp.down_proj
+Quantizing ...
+15 self_attn.k_proj
+Quantizing ...
+15 self_attn.v_proj
+Quantizing ...
+15 self_attn.q_proj
+Quantizing ...
+15 self_attn.o_proj
+Quantizing ...
+15 mlp.up_proj
+Quantizing ...
+15 mlp.gate_proj
+Quantizing ...
+15 mlp.down_proj
+Quantizing ...
+16 self_attn.k_proj
+Quantizing ...
+16 self_attn.v_proj
+Quantizing ...
+16 self_attn.q_proj
+Quantizing ...
+16 self_attn.o_proj
+Quantizing ...
+16 mlp.up_proj
+Quantizing ...
+16 mlp.gate_proj
+Quantizing ...
+16 mlp.down_proj
+Quantizing ...
+17 self_attn.k_proj
+Quantizing ...
+17 self_attn.v_proj
+Quantizing ...
+17 self_attn.q_proj
+Quantizing ...
+17 self_attn.o_proj
+Quantizing ...
+17 mlp.up_proj
+Quantizing ...
+17 mlp.gate_proj
+Quantizing ...
+17 mlp.down_proj
+Quantizing ...
+18 self_attn.k_proj
+Quantizing ...
+18 self_attn.v_proj
+Quantizing ...
+18 self_attn.q_proj
+Quantizing ...
+18 self_attn.o_proj
+Quantizing ...
+18 mlp.up_proj
+Quantizing ...
+18 mlp.gate_proj
+Quantizing ...
+18 mlp.down_proj
+Quantizing ...
+19 self_attn.k_proj
+Quantizing ...
+19 self_attn.v_proj
+Quantizing ...
+19 self_attn.q_proj
+Quantizing ...
+19 self_attn.o_proj
+Quantizing ...
+19 mlp.up_proj
+Quantizing ...
+19 mlp.gate_proj
+Quantizing ...
+19 mlp.down_proj
+Quantizing ...
+20 self_attn.k_proj
+Quantizing ...
+20 self_attn.v_proj
+Quantizing ...
+20 self_attn.q_proj
+Quantizing ...
+20 self_attn.o_proj
+Quantizing ...
+20 mlp.up_proj
+Quantizing ...
+20 mlp.gate_proj
+Quantizing ...
+20 mlp.down_proj
+Quantizing ...
+21 self_attn.k_proj
+Quantizing ...
+21 self_attn.v_proj
+Quantizing ...
+21 self_attn.q_proj
+Quantizing ...
+21 self_attn.o_proj
+Quantizing ...
+21 mlp.up_proj
+Quantizing ...
+21 mlp.gate_proj
+Quantizing ...
+21 mlp.down_proj
+Quantizing ...
+22 self_attn.k_proj
+Quantizing ...
+22 self_attn.v_proj
+Quantizing ...
+22 self_attn.q_proj
+Quantizing ...
+22 self_attn.o_proj
+Quantizing ...
+22 mlp.up_proj
+Quantizing ...
+22 mlp.gate_proj
+Quantizing ...
+22 mlp.down_proj
+Quantizing ...
+23 self_attn.k_proj
+Quantizing ...
+23 self_attn.v_proj
+Quantizing ...
+23 self_attn.q_proj
+Quantizing ...
+23 self_attn.o_proj
+Quantizing ...
+23 mlp.up_proj
+Quantizing ...
+23 mlp.gate_proj
+Quantizing ...
+23 mlp.down_proj
+Quantizing ...
+24 self_attn.k_proj
+Quantizing ...
+24 self_attn.v_proj
+Quantizing ...
+24 self_attn.q_proj
+Quantizing ...
+24 self_attn.o_proj
+Quantizing ...
+24 mlp.up_proj
+Quantizing ...
+24 mlp.gate_proj
+Quantizing ...
+24 mlp.down_proj
+Quantizing ...
+25 self_attn.k_proj
+Quantizing ...
+25 self_attn.v_proj
+Quantizing ...
+25 self_attn.q_proj
+Quantizing ...
+25 self_attn.o_proj
+Quantizing ...
+25 mlp.up_proj
+Quantizing ...
+25 mlp.gate_proj
+Quantizing ...
+25 mlp.down_proj
+Quantizing ...
+26 self_attn.k_proj
+Quantizing ...
+26 self_attn.v_proj
+Quantizing ...
+26 self_attn.q_proj
+Quantizing ...
+26 self_attn.o_proj
+Quantizing ...
+26 mlp.up_proj
+Quantizing ...
+26 mlp.gate_proj
+Quantizing ...
+26 mlp.down_proj
+Quantizing ...
+27 self_attn.k_proj
+Quantizing ...
+27 self_attn.v_proj
+Quantizing ...
+27 self_attn.q_proj
+Quantizing ...
+27 self_attn.o_proj
+Quantizing ...
+27 mlp.up_proj
+Quantizing ...
+27 mlp.gate_proj
+Quantizing ...
+27 mlp.down_proj
+Quantizing ...
+28 self_attn.k_proj
+Quantizing ...
+28 self_attn.v_proj
+Quantizing ...
+28 self_attn.q_proj
+Quantizing ...
+28 self_attn.o_proj
+Quantizing ...
+28 mlp.up_proj
+Quantizing ...
+28 mlp.gate_proj
+Quantizing ...
+28 mlp.down_proj
+Quantizing ...
+29 self_attn.k_proj
+Quantizing ...
+29 self_attn.v_proj
+Quantizing ...
+29 self_attn.q_proj
+Quantizing ...
+29 self_attn.o_proj
+Quantizing ...
+29 mlp.up_proj
+Quantizing ...
+29 mlp.gate_proj
+Quantizing ...
+29 mlp.down_proj
+Quantizing ...
+30 self_attn.k_proj
+Quantizing ...
+30 self_attn.v_proj
+Quantizing ...
+30 self_attn.q_proj
+Quantizing ...
+30 self_attn.o_proj
+Quantizing ...
+30 mlp.up_proj
+Quantizing ...
+30 mlp.gate_proj
+Quantizing ...
+30 mlp.down_proj
+Quantizing ...
+31 self_attn.k_proj
+Quantizing ...
+31 self_attn.v_proj
+Quantizing ...
+31 self_attn.q_proj
+Quantizing ...
+31 self_attn.o_proj
+Quantizing ...
+31 mlp.up_proj
+Quantizing ...
+31 mlp.gate_proj
+Quantizing ...
+31 mlp.down_proj
+Quantizing ...
+32 self_attn.k_proj
+Quantizing ...
+32 self_attn.v_proj
+Quantizing ...
+32 self_attn.q_proj
+Quantizing ...
+32 self_attn.o_proj
+Quantizing ...
+32 mlp.up_proj
+Quantizing ...
+32 mlp.gate_proj
+Quantizing ...
+32 mlp.down_proj
+Quantizing ...
+33 self_attn.k_proj
+Quantizing ...
+33 self_attn.v_proj
+Quantizing ...
+33 self_attn.q_proj
+Quantizing ...
+33 self_attn.o_proj
+Quantizing ...
+33 mlp.up_proj
+Quantizing ...
+33 mlp.gate_proj
+Quantizing ...
+33 mlp.down_proj
+Quantizing ...
+34 self_attn.k_proj
+Quantizing ...
+34 self_attn.v_proj
+Quantizing ...
+34 self_attn.q_proj
+Quantizing ...
+34 self_attn.o_proj
+Quantizing ...
+34 mlp.up_proj
+Quantizing ...
+34 mlp.gate_proj
+Quantizing ...
+34 mlp.down_proj
+Quantizing ...
+35 self_attn.k_proj
+Quantizing ...
+35 self_attn.v_proj
+Quantizing ...
+35 self_attn.q_proj
+Quantizing ...
+35 self_attn.o_proj
+Quantizing ...
+35 mlp.up_proj
+Quantizing ...
+35 mlp.gate_proj
+Quantizing ...
+35 mlp.down_proj
+Quantizing ...
+36 self_attn.k_proj
+Quantizing ...
+36 self_attn.v_proj
+Quantizing ...
+36 self_attn.q_proj
+Quantizing ...
+36 self_attn.o_proj
+Quantizing ...
+36 mlp.up_proj
+Quantizing ...
+36 mlp.gate_proj
+Quantizing ...
+36 mlp.down_proj
+Quantizing ...
+37 self_attn.k_proj
+Quantizing ...
+37 self_attn.v_proj
+Quantizing ...
+37 self_attn.q_proj
+Quantizing ...
+37 self_attn.o_proj
+Quantizing ...
+37 mlp.up_proj
+Quantizing ...
+37 mlp.gate_proj
+Quantizing ...
+37 mlp.down_proj
+Quantizing ...
+38 self_attn.k_proj
+Quantizing ...
+38 self_attn.v_proj
+Quantizing ...
+38 self_attn.q_proj
+Quantizing ...
+38 self_attn.o_proj
+Quantizing ...
+38 mlp.up_proj
+Quantizing ...
+38 mlp.gate_proj
+Quantizing ...
+38 mlp.down_proj
+Quantizing ...
+39 self_attn.k_proj
+Quantizing ...
+39 self_attn.v_proj
+Quantizing ...
+39 self_attn.q_proj
+Quantizing ...
+39 self_attn.o_proj
+Quantizing ...
+39 mlp.up_proj
+Quantizing ...
+39 mlp.gate_proj
+Quantizing ...
+39 mlp.down_proj
+Quantizing ...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...

From 8d118a7afbeeabd6b78b65305ea8ca976e16cd3e Mon Sep 17 00:00:00 2001
From: root <kumbonghermann@gmail.com>
Date: Sun, 14 May 2023 08:22:52 +0000
Subject: [PATCH 11/12] some results

---
 delta_2bits_sparse_09.txt  | 236 +++++++++++++++++++++++++++++++++++++
 delta_2bits_sparse_099.txt | 224 +++++++++++++++++++++++++++++++++++
 delta_4bits_sparse_09.txt  | 226 +++++++++++++++++++++++++++++++++++
 delta_4bits_sparse_099.txt | 226 +++++++++++++++++++++++++++++++++++
 4 files changed, 912 insertions(+)

diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt
index ae0aeb4..0b50db2 100644
--- a/delta_2bits_sparse_09.txt
+++ b/delta_2bits_sparse_09.txt
@@ -798,3 +798,239 @@ Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+wikitext2
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+5.098198890686035
+ptb-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+19.15268325805664
+c4-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt
index 02be0af..9ec985c 100644
--- a/delta_2bits_sparse_099.txt
+++ b/delta_2bits_sparse_099.txt
@@ -809,3 +809,227 @@ Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+wikitext2
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+5.087564945220947
+ptb-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+19.305665969848633
+c4-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt
index e173fbb..581b389 100644
--- a/delta_4bits_sparse_09.txt
+++ b/delta_4bits_sparse_09.txt
@@ -810,3 +810,229 @@ Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+wikitext2
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+5.098198890686035
+ptb-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+19.15268325805664
+c4-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt
index 80e3ba1..8fb4393 100644
--- a/delta_4bits_sparse_099.txt
+++ b/delta_4bits_sparse_099.txt
@@ -806,3 +806,229 @@ Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
 Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+Hard Thresholding...
+wikitext2
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+5.087564945220947
+ptb-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+19.305665969848633
+c4-new
+Evaluating ...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20

From 559eef86554c83d669a6a72ebc89feda2663036f Mon Sep 17 00:00:00 2001
From: root <kumbonghermann@gmail.com>
Date: Sun, 14 May 2023 08:31:55 +0000
Subject: [PATCH 12/12] even more results

---
 delta_2bits_sparse_09.txt  | 18 ++++++++++++++++++
 delta_2bits_sparse_099.txt | 19 +++++++++++++++++++
 delta_4bits_sparse_09.txt  | 16 ++++++++++++++++
 delta_4bits_sparse_099.txt | 20 ++++++++++++++++++++
 4 files changed, 73 insertions(+)

diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt
index 0b50db2..8089e8e 100644
--- a/delta_2bits_sparse_09.txt
+++ b/delta_2bits_sparse_09.txt
@@ -1034,3 +1034,21 @@ Evaluating ...
 20
 21
 22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+6.6517863273620605
diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt
index 9ec985c..a7ba5b7 100644
--- a/delta_2bits_sparse_099.txt
+++ b/delta_2bits_sparse_099.txt
@@ -1033,3 +1033,22 @@ Evaluating ...
 19
 20
 21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+6.614907741546631
diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt
index 581b389..8089e8e 100644
--- a/delta_4bits_sparse_09.txt
+++ b/delta_4bits_sparse_09.txt
@@ -1036,3 +1036,19 @@ Evaluating ...
 22
 23
 24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+6.6517863273620605
diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt
index 8fb4393..a7ba5b7 100644
--- a/delta_4bits_sparse_099.txt
+++ b/delta_4bits_sparse_099.txt
@@ -1032,3 +1032,23 @@ Evaluating ...
 18
 19
 20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+6.614907741546631