From 9ac47eaa5d1d98b09f7b04cfc758ceda7d735f90 Mon Sep 17 00:00:00 2001 From: ningwanyi Date: Thu, 11 May 2023 18:55:57 +0000 Subject: [PATCH 01/12] some evaluation --- benchmark.py | 2 + fmzip.py | 0 gptj.py | 0 opt.py | 104 +++++++++++++++++++++++++++++---------------------- opt_delta.py | 24 ++++++++---- 5 files changed, 79 insertions(+), 51 deletions(-) create mode 100644 benchmark.py create mode 100644 fmzip.py create mode 100644 gptj.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..acf2ea0 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,2 @@ +wbits = [2, 3, 4, 8] +sparsity = [0.0, 0.5, 0.9] \ No newline at end of file diff --git a/fmzip.py b/fmzip.py new file mode 100644 index 0000000..e69de29 diff --git a/gptj.py b/gptj.py new file mode 100644 index 0000000..e69de29 diff --git a/opt.py b/opt.py index edf40bc..77f0f9b 100644 --- a/opt.py +++ b/opt.py @@ -6,7 +6,7 @@ from gptq import * from modelutils import * from quant import quantize, Quantizer, Quant3Linear, make_quant3 - +from prettytable import PrettyTable def get_opt(model): import torch def skip(*args, **kwargs): @@ -223,6 +223,7 @@ def forward(self, inp, **kwargs): print(ppl.item()) model.config.use_cache = use_cache + return ppl.item() # TODO: perform packing on GPU def opt_pack3(model, quantizers): @@ -351,6 +352,48 @@ def sync(): print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) +def main(args): + if args.load: + model = load_quant3(args.model, args.load) + else: + model = get_opt(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = opt_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + opt_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + + dataloader, testloader = get_loaders( + args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + ppl = opt_eval(model, testloader, DEV) + + if args.save: + opt_pack3(model, quantizers) + torch.save(model.state_dict(), args.save) + + return ppl + + if __name__ == '__main__': import argparse from datautils import * @@ -358,11 +401,11 @@ def sync(): parser = argparse.ArgumentParser() parser.add_argument( - 'model', type=str, + '--model', type=str, default='lnair/opt-1.3b-wikitext2', help='OPT model to load; pass `facebook/opt-X`.' ) parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.' ) parser.add_argument( @@ -427,44 +470,17 @@ def sync(): ) args = parser.parse_args() - - if args.load: - model = load_quant3(args.model, args.load) - else: - model = get_opt(args.model) - model.eval() - - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - - if args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = opt_sequential(model, dataloader, DEV) - print(time.time() - tick) - - if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - opt_multigpu(model, gpus) - else: - model = model.to(DEV) - if args.benchmark: - input_ids = next(iter(dataloader))[0][:, :args.benchmark] - benchmark(model, input_ids, check=args.check) - if args.load: - exit() - - datasets = ['wikitext2', 'ptb', 'c4'] - if args.new_eval: - datasets = ['wikitext2', 'ptb-new', 'c4-new'] - for dataset in datasets: - dataloader, testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - print(dataset) - opt_eval(model, testloader, DEV) - - if args.save: - opt_pack3(model, quantizers) - torch.save(model.state_dict(), args.save) + + results = PrettyTable() + results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) + print(results) + print('finished.') diff --git a/opt_delta.py b/opt_delta.py index f9f6bbf..82371a4 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -7,7 +7,7 @@ from gptq import * from modelutils import * from quant import * - +from prettytable import PrettyTable import copy #from prettytable import PrettyTable @@ -548,7 +548,9 @@ def main(args): if args.save: opt_pack3(model, quantizers) torch.save(model.state_dict(), args.save) - + + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return ppl, n_params, comp_time if __name__ == '__main__': import argparse @@ -601,7 +603,7 @@ def main(args): help='Whether to perform symmetric quantization.' ) parser.add_argument( - '--save', type=str, default='', + '--save', type=str, default='opt-1.3b-wikitext2-wbits2.pt', help='Save quantized checkpoint under this name.' ) parser.add_argument( @@ -647,8 +649,16 @@ def main(args): ) args = parser.parse_args() - #results = PrettyTable() - - main(args) - + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki', 'ptb', 'c4'] + for n_bits in [2, 3, 4]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'opt-not-delta1.3b-%s-wbits%d.pt' % (dataset, n_bits) + ppl, n_params, comp_time = main(args) + ppls.append(ppl) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(results) print('finished.') From eca0c2e3215455d047d4c4bb5e98c70d632fd7c8 Mon Sep 17 00:00:00 2001 From: kumbong Date: Fri, 12 May 2023 08:38:37 +0000 Subject: [PATCH 02/12] experiment scripts + gptj --- .gitignore | 5 +- evaluation.sh | 52 +++++ file_0.txt | 7 + file_1.txt | 7 + file_2.txt | 7 + file_3.txt | 7 + file_4.txt | 7 + file_5.txt | 7 + gptj.py | 492 +++++++++++++++++++++++++++++++++++++++++++ jt_datautils/cot.py | 105 +++++++++ jt_datautils/pile.py | 77 +++++++ opt_delta.py | 30 +-- requirements.txt | 8 +- src/fmzip | 1 + 14 files changed, 798 insertions(+), 14 deletions(-) create mode 100755 evaluation.sh create mode 100644 file_0.txt create mode 100644 file_1.txt create mode 100644 file_2.txt create mode 100644 file_3.txt create mode 100644 file_4.txt create mode 100644 file_5.txt create mode 100644 jt_datautils/cot.py create mode 100644 jt_datautils/pile.py create mode 160000 src/fmzip diff --git a/.gitignore b/.gitignore index dbd6338..761a9f6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ dist/ .idea *.egg-info/ *.safetensors -outputs/ \ No newline at end of file +outputs/ +.cache/ +data/ +results/ \ No newline at end of file diff --git a/evaluation.sh b/evaluation.sh new file mode 100755 index 0000000..aaea40f --- /dev/null +++ b/evaluation.sh @@ -0,0 +1,52 @@ +CUDA_VISIBLE_DEVICES=0 python3 opt_delta.py \ + --groupsize 1024 \ + --delta \ + --benchmark_results "file_0.txt" \ +& +CUDA_VISIBLE_DEVICES=1 python3 opt_delta.py \ + --groupsize 1024 \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 \ + --delta \ + --benchmark_results "file_1.txt" \ +& +CUDA_VISIBLE_DEVICES=2 python3 opt_delta.py \ + --groupsize 1024 \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 \ + --delta \ + --benchmark_results "file_2.txt" \ +& +CUDA_VISIBLE_DEVICES=3 python3 opt_delta.py \ + --groupsize 1024 \ + --delta \ + --rank 32 \ + --benchmark_results "file_3.txt" \ +& +CUDA_VISIBLE_DEVICES=4 python3 opt_delta.py \ + --groupsize 1024 \ + --delta \ + --rank 16 \ + --benchmark_results "file_4.txt" \ +& +CUDA_VISIBLE_DEVICES=5 python3 opt_delta.py \ + --groupsize 1024 \ + --delta \ + --rank 64 \ + --benchmark_results "file_5.txt" \ +& +CUDA_VISIBLE_DEVICES=6 python3 opt_delta.py \ + --groupsize 1024 \ + --delta \ + --rank 32 \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 \ + --benchmark_results "file_6.txt" \ +& +CUDA_VISIBLE_DEVICES=7 python3 opt_delta.py \ + --groupsize 1024 \ + --delta \ + --rank 32 \ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 \ + --benchmark_results "file_7.txt" \ No newline at end of file diff --git a/file_0.txt b/file_0.txt new file mode 100644 index 0000000..7fd8537 --- /dev/null +++ b/file_0.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+-------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+-------------------+--------------------+ +| 2 | 107356160 | 315.56584095954895 | 12.70229721069336 | 18.99186134338379 | 16.049821853637695 | +| 3 | 107356160 | 254.49543404579163 | 12.98267936706543 | 19.62110710144043 | 16.652606964111328 | +| 4 | 107356160 | 285.25878047943115 | 12.996271133422852 | 19.65008544921875 | 16.664426803588867 | ++------+-----------+--------------------+--------------------+-------------------+--------------------+ \ No newline at end of file diff --git a/file_1.txt b/file_1.txt new file mode 100644 index 0000000..1af604f --- /dev/null +++ b/file_1.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 288.47421526908875 | 12.124371528625488 | 17.396339416503906 | 15.110072135925293 | +| 3 | 107356160 | 310.34640645980835 | 12.246709823608398 | 17.316566467285156 | 14.97178840637207 | +| 4 | 107356160 | 262.9206793308258 | 12.252873420715332 | 17.329992294311523 | 14.979094505310059 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_2.txt b/file_2.txt new file mode 100644 index 0000000..4ca10d2 --- /dev/null +++ b/file_2.txt @@ -0,0 +1,7 @@ ++------+-----------+-------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+-------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 289.8132817745209 | 13.843452453613281 | 16.968669891357422 | 14.779077529907227 | +| 3 | 107356160 | 307.7978012561798 | 13.91087532043457 | 16.95600700378418 | 14.742414474487305 | +| 4 | 107356160 | 262.0493402481079 | 13.913723945617676 | 16.955684661865234 | 14.743617057800293 | ++------+-----------+-------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_3.txt b/file_3.txt new file mode 100644 index 0000000..672862a --- /dev/null +++ b/file_3.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 283.91542887687683 | 12.507635116577148 | 18.553525924682617 | 15.613986015319824 | +| 3 | 107356160 | 287.85402369499207 | 12.571398735046387 | 18.915355682373047 | 15.952068328857422 | +| 4 | 107356160 | 279.67540669441223 | 12.590620040893555 | 18.968795776367188 | 15.981791496276855 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_4.txt b/file_4.txt new file mode 100644 index 0000000..e2c3608 --- /dev/null +++ b/file_4.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 274.58040595054626 | 12.96647834777832 | 18.44032859802246 | 15.488606452941895 | +| 3 | 107356160 | 277.05651092529297 | 12.934049606323242 | 18.722591400146484 | 15.750381469726562 | +| 4 | 107356160 | 282.69956731796265 | 12.932695388793945 | 18.789344787597656 | 15.76345443725586 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/file_5.txt b/file_5.txt new file mode 100644 index 0000000..2655a74 --- /dev/null +++ b/file_5.txt @@ -0,0 +1,7 @@ ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| Bits | n_params | Time | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ +| 2 | 107356160 | 262.34580183029175 | 12.373908996582031 | 18.664175033569336 | 15.718718528747559 | +| 3 | 107356160 | 271.34020018577576 | 12.558426856994629 | 19.13666343688965 | 16.14783477783203 | +| 4 | 107356160 | 255.5096390247345 | 12.59843921661377 | 19.159931182861328 | 16.166603088378906 | ++------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/gptj.py b/gptj.py index e69de29..a4e34e7 100644 --- a/gptj.py +++ b/gptj.py @@ -0,0 +1,492 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import quantize, Quantizer, Quant3Linear, make_quant3 +from prettytable import PrettyTable +from transformers import GPTJForCausalLM + +def get_opt(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + model = GPTJForCausalLM.from_pretrained( + model, + revision="float16", + torch_dtype=torch.float16, + ).to("cuda") + model.seqlen = model.config.max_position_embeddings + return model + +@torch.no_grad() +def opt_sequential(model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def opt_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.decoder.final_layer_norm is not None: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev) + if model.model.decoder.project_out is not None: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.decoder.final_layer_norm is not None: + hidden_states = model.model.decoder.final_layer_norm(hidden_states) + if model.model.decoder.project_out is not None: + hidden_states = model.model.decoder.project_out(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + return ppl.item() + +# TODO: perform packing on GPU +def opt_pack3(model, quantizers): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant3(model, quantizers, faster=args.faster_kernel) + qlayers = find_layers(model, [Quant3Linear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name] = quantizers[name].cpu() + qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) + print('Done.') + return model + +def load_quant3(model, checkpoint): + from transformers import OPTConfig, OPTForCausalLM + config = OPTConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = OPTForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']: + if name in layers: + del layers[name] + make_quant3(model, layers, faster=args.faster_kernel) + + print('Loading model ...') + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = model.config.max_position_embeddings + print('Done.') + + return model + +def opt_multigpu(model, gpus): + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0]) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0]) + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0]) + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1]) + if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1]) + import copy + model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) + + cache = {'mask': None} + + class MoveModule(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + self.dev = next(iter(self.module.parameters())).device + def forward(self, *inp, **kwargs): + inp = list(inp) + if inp[0].device != self.dev: + inp[0] = inp[0].to(self.dev) + if cache['mask'] is None or cache['mask'].device != self.dev: + cache['mask'] = kwargs['attention_mask'].to(self.dev) + kwargs['attention_mask'] = cache['mask'] + tmp = self.module(*inp, **kwargs) + return tmp + + layers = model.model.decoder.layers + pergpu = math.ceil(len(layers) / len(gpus)) + for i in range(len(layers)): + layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) + + model.gpus = gpus + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.model.decoder.layers): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. + + def sync(): + if hasattr(model, 'gpus'): + for gpu in model.gpus: + torch.cuda.synchronize(gpu) + else: + torch.cuda.synchronize() + with torch.no_grad(): + attention_mask = torch.ones((1, input_ids.numel()), device=DEV) + times = [] + for i in range(input_ids.numel()): + tick = time.time() + out = model( + input_ids[:, i].reshape(-1), + past_key_values=cache['past'], + attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) + ) + sync() + times.append(time.time() - tick) + print(i, times[-1]) + if check and i != input_ids.numel() - 1: + tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() + cache['past'] = list(out.past_key_values) + del out + sync() + import numpy as np + print('Median:', np.median(times)) + if check: + print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + + +def main(args): + if args.load: + model = load_quant3(args.model, args.load) + else: + model = get_opt(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = opt_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + opt_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + + dataloader, testloader = get_loaders( + args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + ppl = opt_eval(model, testloader, DEV) + + if args.save: + opt_pack3(model, quantizers) + torch.save(model.state_dict(), args.save) + + return ppl + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='lnair/opt-1.3b-wikitext2', + help='OPT model to load; pass `facebook/opt-X`.' + ) + parser.add_argument( + '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--trits', action='store_true', + help='Whether to use trits for quantization.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save quantized checkpoint under this name.' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load quantized model.' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perplexity during benchmarking for verification.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' + ) + parser.add_argument( + '--faster-kernel', action='store_true', + help='Whether to use the new faster kernel for benchmarking.' + ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + + args = parser.parse_args() + + get_opt("EleutherAI/gpt-j-6B") + # results = PrettyTable() + # results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] + # for n_bits in [4, 3, 2]: + # ppls = [] + # for dataset in ['wikitext2', 'ptb', 'c4']: + # args.dataset = dataset + # args.wbits = n_bits + # args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits) + # ppl = main(args) + # ppls.append(ppl) + # results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) + # print(results) + print('finished.') diff --git a/jt_datautils/cot.py b/jt_datautils/cot.py new file mode 100644 index 0000000..15796e9 --- /dev/null +++ b/jt_datautils/cot.py @@ -0,0 +1,105 @@ +import os +import re +import torch +import json +from torch.utils.data import IterableDataset, DataLoader +from itertools import cycle, islice +import random +from datasets import Dataset +from datasets import load_dataset, load_from_disk +#from comm.comm_utils import * + + + +class StreamDataset(IterableDataset): + def __init__(self, cot_data_path, tokenizer, seq_length=1024): + + self.cot_data_path = cot_data_path + + with open(cot_data_path) as f: + self.cot_data = json.load(f) + + self.buffer_tokens = [] + + self.tokenizer = tokenizer + self.seq_length = seq_length + + self.it = None + + def state_dict(self): + return {} + + def load_state_dict(self, state_dict): + pass + + def get_sequence_from_cot(self): + + while True: + + keys = list(self.cot_data.keys()) + random.shuffle(keys) + + input_ids = [] + + for k in keys: + + v = self.cot_data[k] + + input_ids += self.tokenizer(v + '\n\n')['input_ids'] + if len(input_ids) < self.seq_length: + continue + # input_ids += [self.tokenizer.eos_token_id]*(self.seq_length - len(input_ids)) + + input_ids = input_ids[:self.seq_length] + input_ids = torch.tensor(input_ids).long() + + yield input_ids + + input_ids = [] + + def get_sequence(self): + + it_cot = cycle(self.get_sequence_from_cot()) + + while True: + + input_ids = next(it_cot) + + + yield { + 'input_ids': input_ids, + } + + + def get_stream(self): + return cycle(self.get_sequence()) + + def __iter__(self): + if self.it is None: + self.it = self.get_stream() + return self.it + + + +def get_cot_train_data_loader(args, tokenizer, num_workers=0, state_dict=None): + + stream_dataset = StreamDataset( + './data/mmlu-cot.json', + tokenizer=tokenizer, seq_length=args.seq_length + ) + + if state_dict is not None: + stream_dataset.load_state_dict(state_dict) + + train_data_loader = torch.utils.data.DataLoader(stream_dataset, + batch_size=args.batch_size * args.data_group_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + collate_fn=None) + return train_data_loader + +def get_cot_ds(data_path, tokenizer, seq_length): + return StreamDataset(os.path.join(data_path,'mmlu-cot.json'), + tokenizer=tokenizer, seq_length=seq_length + ) \ No newline at end of file diff --git a/jt_datautils/pile.py b/jt_datautils/pile.py new file mode 100644 index 0000000..fbddca5 --- /dev/null +++ b/jt_datautils/pile.py @@ -0,0 +1,77 @@ + +import os +import re +import torch +from torch.utils.data import IterableDataset, DataLoader +from itertools import cycle, islice +import random +from datasets import Dataset +from datasets import load_dataset, load_from_disk +# from comm.comm_utils import * + + +class StreamDataset(IterableDataset): + default_doc_separator = '' + def __init__(self, data, tokenizer, seq_length=1024, doc_separator=None): + self.data = data + self.tokenizer = tokenizer + self.seq_length = seq_length + self.doc_separator = doc_separator or StreamDataset.default_doc_separator + self.it = None + self.iter_count = 0 + self.buffer_tokens = [] + + def state_dict(self): + return { + 'iter_count': self.iter_count, + 'buffer_tokens': self.buffer_tokens, + } + + def load_state_dict(self, state_dict): + self.iter_count = state_dict['iter_count'] + self.buffer_tokens = state_dict['buffer_tokens'] + self.data = self.data.skip(self.iter_count) + + def get_sequence(self): + buffer_tokens = self.buffer_tokens + for x in self.data: + self.iter_count += 1 + curr_tokens = self.tokenizer(self.doc_separator + x['text'])['input_ids'] + buffer_tokens += curr_tokens + while len(buffer_tokens) >= self.seq_length: + tokens = buffer_tokens[:self.seq_length] + buffer_tokens = buffer_tokens[self.seq_length:] + input_ids = torch.tensor(tokens) + self.buffer_tokens = buffer_tokens # update for restore + yield { + 'input_ids': input_ids, + } + + def get_stream(self): + return cycle(self.get_sequence()) + + def __iter__(self): + if self.it is None: + self.it = self.get_stream() + return self.it + + +def get_pile_train_data_loader(args, tokenizer, num_workers=0, state_dict=None): + + data = load_dataset('the_pile', split="train", streaming=True).shuffle(buffer_size=10_000, seed=args.seed) + stream_dataset = StreamDataset(data, tokenizer, args.seq_length) + + if state_dict is not None: + stream_dataset.load_state_dict(state_dict) + + train_data_loader = torch.utils.data.DataLoader(stream_dataset, + batch_size=args.batch_size * args.data_group_size, + shuffle=False, + num_workers=num_workers, + pin_memory=True, + collate_fn=None) + return train_data_loader + +def get_pile_ds(tokenizer, seq_length): + data = load_dataset("the_pile", split="train", streaming=True) + return StreamDataset(data, tokenizer, seq_length) \ No newline at end of file diff --git a/opt_delta.py b/opt_delta.py index 82371a4..481cc9a 100644 --- a/opt_delta.py +++ b/opt_delta.py @@ -9,6 +9,7 @@ from quant import * from prettytable import PrettyTable import copy +import os #from prettytable import PrettyTable def get_opt(model): @@ -539,17 +540,16 @@ def main(args): dataset, seed=args.seed, model=args.model, seqlen=model.seqlen ) - ppl = opt_eval(model, testloader, DEV) - print(ppl) + # ppl = opt_eval(model, testloader, DEV) + # print(ppl) if args.rank > 0: - print("Number of params without low rank ", num_params) - print("Number of params with low rank", num_params - num_params_saved_lr) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) if args.save: opt_pack3(model, quantizers) torch.save(model.state_dict(), args.save) - - n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) return ppl, n_params, comp_time if __name__ == '__main__': @@ -610,6 +610,10 @@ def main(args): '--load', type=str, default='', help='Load quantized model.' ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) parser.add_argument( '--benchmark', type=int, default=0, help='Number of tokens to use for benchmarking.' @@ -650,15 +654,17 @@ def main(args): args = parser.parse_args() results = PrettyTable() - results.field_names = ['Bits', 'n_params', 'Time', 'wiki', 'ptb', 'c4'] - for n_bits in [2, 3, 4]: + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [2]: ppls = [] - for dataset in ['wikitext2', 'ptb', 'c4']: + for dataset in ['wikitext2']: args.dataset = dataset args.wbits = n_bits args.save = 'opt-not-delta1.3b-%s-wbits%d.pt' % (dataset, n_bits) ppl, n_params, comp_time = main(args) - ppls.append(ppl) - results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) - print(results) + # ppls.append(ppl) + # results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + # print(results) + # with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + # f.write(str(results)) print('finished.') diff --git a/requirements.txt b/requirements.txt index 7417000..b5c4f04 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,9 @@ transformers loguru -datasets \ No newline at end of file +datasets +safetensors==0.3.0 +datasets==2.10.1 +sentencepiece +git+https://github.com/huggingface/transformers +ninja +prettytable \ No newline at end of file diff --git a/src/fmzip b/src/fmzip new file mode 160000 index 0000000..b41e785 --- /dev/null +++ b/src/fmzip @@ -0,0 +1 @@ +Subproject commit b41e7856f092c80286577b2eb5e1294a764099d6 From 9d7744771d1c04710bf0fe51d69bd23fdcb6e537 Mon Sep 17 00:00:00 2001 From: Fluidstack User Date: Fri, 12 May 2023 17:30:51 +0000 Subject: [PATCH 03/12] gptj --- gptj.py | 308 +++++++++++++++++++++----------------------------- gptq.py | 44 +++++--- modelutils.py | 18 +-- quant.py | 114 ++----------------- 4 files changed, 171 insertions(+), 313 deletions(-) diff --git a/gptj.py b/gptj.py index a4e34e7..a41f0ff 100644 --- a/gptj.py +++ b/gptj.py @@ -1,43 +1,35 @@ import time +import math import torch import torch.nn as nn +import transformers from gptq import * from modelutils import * -from quant import quantize, Quantizer, Quant3Linear, make_quant3 -from prettytable import PrettyTable -from transformers import GPTJForCausalLM +from quant import * -def get_opt(model): +def get_gptj(model): import torch def skip(*args, **kwargs): pass torch.nn.init.kaiming_uniform_ = skip torch.nn.init.uniform_ = skip torch.nn.init.normal_ = skip - model = GPTJForCausalLM.from_pretrained( - model, - revision="float16", - torch_dtype=torch.float16, - ).to("cuda") - model.seqlen = model.config.max_position_embeddings + from transformers import GPTJForCausalLM + model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 return model @torch.no_grad() -def opt_sequential(model, dataloader, dev): +def gptj_sequential(model, dataloader, dev, means=None, stds=None): print('Starting ...') use_cache = model.config.use_cache model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers = model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype @@ -63,13 +55,10 @@ def forward(self, inp, **kwargs): pass layers[0] = layers[0].module + layers = model.transformer.h layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() torch.cuda.empty_cache() outs = torch.zeros_like(inps) @@ -87,9 +76,9 @@ def forward(self, inp, **kwargs): gptq[name] = GPTQ(subset[name]) gptq[name].quantizer = Quantizer() gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + args.wbits, perchannel=True, sym=False, mse=False ) - + def add_batch(name): def tmp(_, inp, out): gptq[name].add_batch(inp[0].data, out.data) @@ -105,25 +94,24 @@ def tmp(_, inp, out): for name in subset: print(i, name) print('Quantizing ...') - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer - gptq[name].free() + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] layers[i] = layer.cpu() del layer - del gptq + del gptq torch.cuda.empty_cache() inps, outs = outs, inps model.config.use_cache = use_cache - + return quantizers + @torch.no_grad() -def opt_eval(model, testenc, dev): +def gptj_eval(model, testenc, dev): print('Evaluating ...') testenc = testenc.input_ids @@ -131,14 +119,9 @@ def opt_eval(model, testenc, dev): use_cache = model.config.use_cache model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers = model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype @@ -153,27 +136,24 @@ def __init__(self, module): self.module = module def forward(self, inp, **kwargs): inps[cache['i']] = inp - cache['i'] += 1 + cache ['i'] += 1 cache['attention_mask'] = kwargs['attention_mask'] raise ValueError layers[0] = Catcher(layers[0]) for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) try: model(batch) except ValueError: pass layers[0] = layers[0].module + layers = model.transformer.h layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() torch.cuda.empty_cache() - + outs = torch.zeros_like(inps) attention_mask = cache['attention_mask'] @@ -182,11 +162,11 @@ def forward(self, inp, **kwargs): layer = layers[i].to(dev) if args.nearest: - subset = find_layers(layer) + subset = find_layers(layer) for name in subset: quantizer = Quantizer() quantizer.configure( - args.wbits, perchannel=True, sym=args.sym, mse=False + args.wbits, perchannel=True, sym=False, mse=False ) W = subset[name].weight.data quantizer.find_params(W, weight=True) @@ -201,20 +181,14 @@ def forward(self, inp, **kwargs): torch.cuda.empty_cache() inps, outs = outs, inps - if model.model.decoder.final_layer_norm is not None: - model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev) - if model.model.decoder.project_out is not None: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + model.transformer.ln_f = model.transformer.ln_f.to(dev) model.lm_head = model.lm_head.to(dev) - + testenc = testenc.to(dev) nlls = [] for i in range(nsamples): hidden_states = inps[i].unsqueeze(0) - if model.model.decoder.final_layer_norm is not None: - hidden_states = model.model.decoder.final_layer_norm(hidden_states) - if model.model.decoder.project_out is not None: - hidden_states = model.model.decoder.project_out(hidden_states) + hidden_states = model.transformer.ln_f(hidden_states) lm_logits = model.lm_head(hidden_states) shift_logits = lm_logits[:, :-1, :].contiguous() shift_labels = testenc[ @@ -226,61 +200,60 @@ def forward(self, inp, **kwargs): nlls.append(neg_log_likelihood) ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) print(ppl.item()) + model.config.use_cache = use_cache - return ppl.item() -# TODO: perform packing on GPU -def opt_pack3(model, quantizers): +def gptj_pack(model, quantizers, wbits, groupsize): layers = find_layers(model) layers = {n: layers[n] for n in quantizers} - make_quant3(model, quantizers, faster=args.faster_kernel) - qlayers = find_layers(model, [Quant3Linear]) + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) print('Packing ...') for name in qlayers: print(name) - quantizers[name] = quantizers[name].cpu() - qlayers[name].pack(layers[name], quantizers[name].scale, quantizers[name].zero) - print('Done.') + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') return model -def load_quant3(model, checkpoint): - from transformers import OPTConfig, OPTForCausalLM - config = OPTConfig.from_pretrained(model) +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) def noop(*args, **kwargs): pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop torch.set_default_dtype(torch.half) transformers.modeling_utils._init_weights = False torch.set_default_dtype(torch.half) - model = OPTForCausalLM(config) + model = GPTJForCausalLM(config) torch.set_default_dtype(torch.float) model = model.eval() layers = find_layers(model) - for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']: + for name in ['lm_head']: if name in layers: del layers[name] - make_quant3(model, layers, faster=args.faster_kernel) + make_quant(model, layers, wbits, groupsize) print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = model.config.max_position_embeddings - print('Done.') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') return model -def opt_multigpu(model, gpus): - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0]) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0]) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0]) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1]) - if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm: - model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1]) +def gptj_multigpu(model, gpus): + model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and model.model.norm: + model.model.norm = model.model.norm.to(gpus[-1]) import copy model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) @@ -289,19 +262,19 @@ def opt_multigpu(model, gpus): class MoveModule(nn.Module): def __init__(self, module): super().__init__() - self.module = module + self_module = module self.dev = next(iter(self.module.parameters())).device def forward(self, *inp, **kwargs): inp = list(inp) if inp[0].device != self.dev: inp[0] = inp[0].to(self.dev) - if cache['mask'] is None or cache['mask'].device != self.dev: + if cache['mask'] is None or cache ['mask'].device != self.dev: cache['mask'] = kwargs['attention_mask'].to(self.dev) kwargs['attention_mask'] = cache['mask'] tmp = self.module(*inp, **kwargs) return tmp - layers = model.model.decoder.layers + layers = model.model.layers pergpu = math.ceil(len(layers) / len(gpus)) for i in range(len(layers)): layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) @@ -318,7 +291,7 @@ def tmp(layer, inp, out): if cache['past']: cache['past'][i] = None return tmp - for i, layer in enumerate(model.model.decoder.layers): + for i, layer in enumerate(model.model.layers): layer.register_forward_hook(clear_past(i)) print('Benchmarking ...') @@ -333,72 +306,35 @@ def sync(): torch.cuda.synchronize(gpu) else: torch.cuda.synchronize() + max_memory = 0 with torch.no_grad(): attention_mask = torch.ones((1, input_ids.numel()), device=DEV) times = [] for i in range(input_ids.numel()): tick = time.time() out = model( - input_ids[:, i].reshape(-1), + input_ids[:, i:i+1], past_key_values=cache['past'], attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) ) sync() times.append(time.time() - tick) print(i, times[-1]) + max_memory = max(max_memory, torch, torch.cuda.memory_allocated() / 1024 /1024) if check and i != input_ids.numel() - 1: tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() - cache['past'] = list(out.past_key_values) + cache['past'] = list(out.past_keys_values) del out sync() import numpy as np print('Median:', np.median(times)) if check: print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + print('max memory(MiB):',max_memory) -def main(args): - if args.load: - model = load_quant3(args.model, args.load) - else: - model = get_opt(args.model) - model.eval() - - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - - if args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = opt_sequential(model, dataloader, DEV) - print(time.time() - tick) - - if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - opt_multigpu(model, gpus) - else: - model = model.to(DEV) - if args.benchmark: - input_ids = next(iter(dataloader))[0][:, :args.benchmark] - benchmark(model, input_ids, check=args.check) - if args.load: - exit() - - - dataloader, testloader = get_loaders( - args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - print(dataset) - ppl = opt_eval(model, testloader, DEV) - - if args.save: - opt_pack3(model, quantizers) - torch.save(model.state_dict(), args.save) - return ppl - if __name__ == '__main__': import argparse from datautils import * @@ -406,11 +342,11 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument( - '--model', type=str, default='lnair/opt-1.3b-wikitext2', - help='OPT model to load; pass `facebook/opt-X`.' + '--model', type=str, default='EleutherAI/gpt-j-6b', + help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' ) parser.add_argument( - '--dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.' ) parser.add_argument( @@ -428,30 +364,26 @@ def main(args): parser.add_argument( '--nearest', action='store_true', help='Whether to run the RTN baseline.' - ) + ) parser.add_argument( '--wbits', type=int, default=16, choices=[2, 3, 4, 16], help='#bits to use for quantization; use 16 for evaluating base model.' ) - parser.add_argument( - '--trits', action='store_true', - help='Whether to use trits for quantization.' - ) parser.add_argument( '--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.' ) parser.add_argument( - '--sym', action='store_true', - help='Whether to perform symmetric quantization.' + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' ) parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' ) parser.add_argument( '--load', type=str, default='', - help='Load quantized model.' + help='Load the quantized GPT-J model' ) parser.add_argument( '--benchmark', type=int, default=0, @@ -459,34 +391,56 @@ def main(args): ) parser.add_argument( '--check', action='store_true', - help='Whether to compute perplexity during benchmarking for verification.' - ) - parser.add_argument( - '--new-eval', action='store_true', - help='Whether to use the new PTB and C4 eval.' - ) - parser.add_argument( - '--faster-kernel', action='store_true', - help='Whether to use the new faster kernel for benchmarking.' - ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' + help='Whether to compute perpexity during benchmarking for verification.' ) + args = parser.parse_args() - - get_opt("EleutherAI/gpt-j-6B") - # results = PrettyTable() - # results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] - # for n_bits in [4, 3, 2]: - # ppls = [] - # for dataset in ['wikitext2', 'ptb', 'c4']: - # args.dataset = dataset - # args.wbits = n_bits - # args.save = 'opt-no-delta-1.3b-%s-wbits%d.pt' % (dataset, n_bits) - # ppl = main(args) - # ppls.append(ppl) - # results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) - # print(results) - print('finished.') + + if type(args.load) is not str: + args.load = args.load.as_posix() + + if args.load: + model = load_quant(args.model, args.load, args.wbits, args.groupsize) + else: + model = get_gptj(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if not args.load and args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = gptj_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + gptj_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + + for dataset in ['wikitext2', 'ptb', 'c4']: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + gptj_eval(model, testloader, DEV) + + + if args.save: + gptj_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) + + if args.save_safetensors: + gptj_pack(model, quantizers, args.wbits, args.groupsize) + from safetensors.torch import save_file as safe_save + safe_save(model.state_dict(), args.save_safetensors) \ No newline at end of file diff --git a/gptq.py b/gptq.py index 2477cac..b4546cc 100644 --- a/gptq.py +++ b/gptq.py @@ -1,16 +1,19 @@ import math import time + import torch -import transformers import torch.nn as nn +import transformers + +from quant import * -from quant import quantize DEBUG = False torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False + class GPTQ: def __init__(self, layer): self.layer = layer @@ -54,7 +57,7 @@ def add_batch(self, inp, out): self.H += inp.matmul(inp.t()) def fasterquant( - self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False + self, blocksize=128, percdamp=.01, groupsize=-1 ): W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -74,11 +77,6 @@ def fasterquant( H[dead, dead] = 1 W[:, dead] = 0 - if actorder: - perm = torch.argsort(torch.diag(H), descending=True) - W = W[:, perm] - H = H[perm][:, perm] - Losses = torch.zeros_like(W) Q = torch.zeros_like(W) @@ -89,6 +87,10 @@ def fasterquant( H = torch.cholesky_inverse(H) H = torch.linalg.cholesky(H, upper=True) Hinv = H + + scale = [] + zero = [] + now_idx = 1 for i1 in range(0, self.columns, blocksize): i2 = min(i1 + blocksize, self.columns) @@ -107,6 +109,11 @@ def fasterquant( if groupsize != -1: if (i1 + i) % groupsize == 0: self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) + + if ((i1 + i) // groupsize) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 q = quantize( w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq @@ -130,21 +137,22 @@ def fasterquant( print(torch.sum(Losses)) torch.cuda.synchronize() - total_time = time.time() - tick - # print('time %.2f' % total_time) - error = torch.sum(Losses).item() - # print('error', error) - - if actorder: - invperm = torch.argsort(perm) - Q = Q[:, invperm] + print('time %.2f' % (time.time() - tick)) + print('error', torch.sum(Losses).item()) if isinstance(self.layer, transformers.Conv1D): Q = Q.t() self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) if DEBUG: print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale,dim=1) + zero = torch.cat(zero,dim=1) + return scale,zero + def free(self): if DEBUG: self.inp1 = None @@ -152,4 +160,4 @@ def free(self): self.H = None self.Losses = None self.Trace = None - torch.cuda.empty_cache() + torch.cuda.empty_cache() \ No newline at end of file diff --git a/modelutils.py b/modelutils.py index c93410d..5b36877 100644 --- a/modelutils.py +++ b/modelutils.py @@ -1,8 +1,10 @@ import torch import torch.nn as nn -from transformers import OPTForCausalLM + + DEV = torch.device('cuda:0') + def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): if type(module) in layers: return {name: module} @@ -11,16 +13,4 @@ def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): res.update(find_layers( child, layers=layers, name=name + '.' + name1 if name != '' else name1 )) - return res - -def get_opt(model): - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - - # model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') - model = OPTForCausalLM.from_pretrained(model, torch_dtype=torch.float16) - model.seqlen = model.config.max_position_embeddings - return model \ No newline at end of file + return res \ No newline at end of file diff --git a/quant.py b/quant.py index f8cc1b7..fe58148 100644 --- a/quant.py +++ b/quant.py @@ -1,11 +1,9 @@ -import math import numpy as np import torch import torch.nn as nn +import math def quantize(x, scale, zero, maxq): - if maxq < 0: - return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) return scale * (q - zero) @@ -18,11 +16,10 @@ def __init__(self, shape=1): self.register_buffer('zero', torch.zeros(shape)) def configure( - self, - bits, perchannel=False, sym=True, - mse=False, norm=2.4, grid=100, maxshrink=.8, - trits=False - ): + self, + bits, perchannel=False, sym=True, + mse=False, norm=2.4, grid=100, maxshrink=.8 + ): self.maxq = torch.tensor(2 ** bits - 1) self.perchannel = perchannel self.sym = sym @@ -30,8 +27,6 @@ def configure( self.norm = norm self.grid = grid self.maxshrink = maxshrink - if trits: - self.maxq = torch.tensor(-1) def find_params(self, x, weight=False): dev = x.device @@ -65,15 +60,11 @@ def find_params(self, x, weight=False): xmin[tmp] = -1 xmax[tmp] = +1 - if self.maxq < 0: - self.scale = xmax - self.zero = xmin + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) else: - self.scale = (xmax - xmin) / self.maxq - if self.sym: - self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) - else: - self.zero = torch.round(-xmin / self.scale) + self.zero = torch.round(-xmin / self.scale) if self.mse: best = torch.full([x.shape[0]], float('inf'), device=dev) @@ -133,91 +124,6 @@ def ready(self): except: print('CUDA extension not installed.') -# Assumes layer is perfectly divisible into 1024 * 1024 blocks -class Quant3Linear(nn.Module): - - def __init__(self, infeatures, outfeatures, faster=False): - super().__init__() - self.register_buffer('zeros', torch.zeros((outfeatures, 1))) - self.register_buffer('scales', torch.zeros((outfeatures, 1))) - self.register_buffer('bias', torch.zeros(outfeatures)) - self.register_buffer( - 'qweight', torch.zeros((infeatures // 32 * 3, outfeatures), dtype=torch.int) - ) - self.faster = faster - - def pack(self, linear, scales, zeros): - self.zeros = zeros * scales - self.scales = scales.clone() - self.bias = linear.bias.clone() - - intweight = torch.round((linear.weight.data + self.zeros) / self.scales).to(torch.int) - intweight = intweight.t().contiguous() - intweight = intweight.numpy().astype(np.uint32) - qweight = np.zeros( - (intweight.shape[0] // 32 * 3, intweight.shape[1]), dtype=np.uint32 - ) - i = 0 - row = 0 - while row < qweight.shape[0]: - for j in range(i, i + 10): - qweight[row] |= intweight[j] << (3 * (j - i)) - i += 10 - qweight[row] |= intweight[i] << 30 - row += 1 - qweight[row] |= (intweight[i] >> 2) & 1 - i += 1 - for j in range(i, i + 10): - qweight[row] |= intweight[j] << (3 * (j - i) + 1) - i += 10 - qweight[row] |= intweight[i] << 31 - row += 1 - qweight[row] |= (intweight[i] >> 1) & 0x3 - i += 1 - for j in range(i, i + 10): - qweight[row] |= intweight[j] << (3 * (j - i) + 2) - i += 10 - row += 1 - - qweight = qweight.astype(np.int32) - self.qweight = torch.from_numpy(qweight) - - def forward(self, x): - if x.shape[-1] == x.numel(): - outshape = list(x.shape) - y = self.bias.clone() - outshape[-1] = self.bias.numel() - dtype = x.dtype - if self.faster: - x = x.half() - quant_cuda.vecquant3matmul_faster(x, self.qweight, y, self.scales, self.zeros) - else: - x = x.float() - quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.zeros) - y = y.to(dtype) - return y.reshape(outshape) - raise ValueError('Only supports a single token currently.') - -def make_quant3(module, names, name='', faster=False): - if isinstance(module, Quant3Linear): - return - for attr in dir(module): - tmp = getattr(module, attr) - name1 = name + '.' + attr if name != '' else attr - if name1 in names: - setattr( - module, attr, Quant3Linear(tmp.in_features, tmp.out_features, faster=faster) - ) - for name1, child in module.named_children(): - make_quant3(child, names, name + '.' + name1 if name != '' else name1, faster=faster) - -def make_quant_lr(module, r_names, l_names, name='', faster=False): - if isinstance(module, Quant3Linear): - return - for attr in dir(module): - tmp = getattr(module, attr) - name1 = name + '.' + attr if name != '' else attr - # Assumes layer is perfectly divisible into 256 * 256 blocks class QuantLinear(nn.Module): def __init__(self, bits, groupsize, infeatures, outfeatures): @@ -356,4 +262,4 @@ def make_quant(module, names, bits, groupsize, name=''): module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features) ) for name1, child in module.named_children(): - make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) + make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) \ No newline at end of file From c57d9c38cb566c6d61480806abd094fa8c076933 Mon Sep 17 00:00:00 2001 From: Fluidstack User Date: Sat, 13 May 2023 00:16:10 +0000 Subject: [PATCH 04/12] WIP GPTJ --- datautils.py | 50 +++++++++++++++++++++++++++++------------------- gptj.py | 31 ++++++++++++++++++++++-------- requirements.txt | 10 +++------- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/datautils.py b/datautils.py index 045121a..2616fc0 100644 --- a/datautils.py +++ b/datautils.py @@ -6,6 +6,7 @@ def set_seed(seed): torch.random.manual_seed(seed) def get_wikitext2(nsamples, seed, seqlen, model): + seqlen = 2048 from datasets import load_dataset traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') @@ -28,6 +29,7 @@ def get_wikitext2(nsamples, seed, seqlen, model): return trainloader, testenc def get_ptb(nsamples, seed, seqlen, model): + seqlen = 2048 from datasets import load_dataset traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') @@ -50,6 +52,8 @@ def get_ptb(nsamples, seed, seqlen, model): return trainloader, testenc def get_c4(nsamples, seed, seqlen, model): + print("loading the c4 dataset") + seqlen = 2048 from datasets import load_dataset traindata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' @@ -65,14 +69,18 @@ def get_c4(nsamples, seed, seqlen, model): random.seed(seed) trainloader = [] for _ in range(nsamples): - while True: - i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') - if trainenc.input_ids.shape[1] >= seqlen: - break - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') + # while True: + # i = random.randint(0, len(traindata) - 1) + # trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') + # print(trainenc.input_ids.shape) + # if trainenc.input_ids.shape[1] > seqlen - 1: + # break + # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + # j = i + seqlen + # inp = trainenc.input_ids[:, i:j] + inp = trainenc.input_ids tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) @@ -80,15 +88,15 @@ def get_c4(nsamples, seed, seqlen, model): import random random.seed(0) valenc = [] - for _ in range(256): - while True: - i = random.randint(0, len(valdata) - 1) - tmp = tokenizer(valdata[i]['text'], return_tensors='pt') - if tmp.input_ids.shape[1] >= seqlen: - break - i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - valenc.append(tmp.input_ids[:, i:j]) + # for _ in range(256): + # while True: + i = random.randint(0, len(valdata) - 1) + tmp = tokenizer(valdata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') + # if tmp.input_ids.shape[1] >= seqlen: + # break + # i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) + # j = i + seqlen + valenc.append(tmp.input_ids) valenc = torch.hstack(valenc) class TokenizerWrapper: def __init__(self, input_ids): @@ -120,6 +128,7 @@ def get_ptb_new(nsamples, seed, seqlen, model): return trainloader, testenc def get_c4_new(nsamples, seed, seqlen, model): + print("loading the c4 new dataset") from datasets import load_dataset traindata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' @@ -130,14 +139,14 @@ def get_c4_new(nsamples, seed, seqlen, model): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - + seqlen = 2048 import random random.seed(seed) trainloader = [] for _ in range(nsamples): while True: i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + trainenc = tokenizer(traindata[i]['text'], max_length = 256, truncation=True, return_tensors='pt') if trainenc.input_ids.shape[1] >= seqlen: break i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) @@ -147,7 +156,7 @@ def get_c4_new(nsamples, seed, seqlen, model): tar[:, :-1] = -100 trainloader.append((inp, tar)) - valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt') + valenc = tokenizer(' '.join(valdata[:1100]['text']), max_length = 256, truncation=True, return_tensors='pt') valenc = valenc.input_ids[:, :(256 * seqlen)] class TokenizerWrapper: @@ -161,6 +170,7 @@ def __init__(self, input_ids): def get_loaders( name, nsamples=128, seed=0, seqlen=2048, model='' ): + print("loading from dataset ", name) if 'wikitext2' in name: return get_wikitext2(nsamples, seed, seqlen, model) if 'ptb' in name: diff --git a/gptj.py b/gptj.py index a41f0ff..cd60177 100644 --- a/gptj.py +++ b/gptj.py @@ -27,8 +27,10 @@ def gptj_sequential(model, dataloader, dev, means=None, stds=None): use_cache = model.config.use_cache model.config.use_cache = False + print(model.transformer.h) layers = model.transformer.h - + print(layers) + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) @@ -50,6 +52,7 @@ def forward(self, inp, **kwargs): layers[0] = Catcher(layers[0]) for batch in dataloader: try: + print(batch[0].shape) model(batch[0].to(dev)) except ValueError: pass @@ -119,8 +122,10 @@ def gptj_eval(model, testenc, dev): use_cache = model.config.use_cache model.config.use_cache = False + print(model.transformer.h) layers = model.transformer.h - + print(layers) + model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) @@ -143,7 +148,13 @@ def forward(self, inp, **kwargs): for i in range(nsamples): batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) try: - model(batch) + print(batch.shape) + # question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + # tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False) + # inputs = tokenizer(question, text, return_tensors="pt") + # print(inputs.shape) + # outputs = model(**inputs) + model(**batch) except ValueError: pass layers[0] = layers[0].module @@ -312,6 +323,7 @@ def sync(): times = [] for i in range(input_ids.numel()): tick = time.time() + out = model( input_ids[:, i:i+1], past_key_values=cache['past'], @@ -346,7 +358,7 @@ def sync(): help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' ) parser.add_argument( - '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'], + '--dataset', type=str, default='c4', choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.' ) parser.add_argument( @@ -394,7 +406,7 @@ def sync(): help='Whether to compute perpexity during benchmarking for verification.' ) - + print("just confirming that I am actually running this stuff") args = parser.parse_args() if type(args.load) is not str: @@ -403,13 +415,16 @@ def sync(): if args.load: model = load_quant(args.model, args.load, args.wbits, args.groupsize) else: + print("getting the model") model = get_gptj(args.model) model.eval() - + print("Done getting the model") + + print("Getting data loaders") dataloader, testloader = get_loaders( args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen ) - + print("finished getting data loaders") if not args.load and args.wbits < 16 and not args.nearest: tick = time.time() quantizers = gptj_sequential(model, dataloader, DEV) @@ -428,7 +443,7 @@ def sync(): exit() - for dataset in ['wikitext2', 'ptb', 'c4']: + for dataset in ['c4']: dataloader, testloader = get_loaders( dataset, seed=args.seed, model=args.model, seqlen=model.seqlen ) diff --git a/requirements.txt b/requirements.txt index b5c4f04..321525d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,5 @@ -transformers -loguru -datasets safetensors==0.3.0 -datasets==2.10.1 +datasets==1.17.0 sentencepiece -git+https://github.com/huggingface/transformers -ninja -prettytable \ No newline at end of file +transformers==4.21.2 +ninja \ No newline at end of file From 671fdab60361336ade958501a4e7aa4e5414fd2b Mon Sep 17 00:00:00 2001 From: ningwanyi Date: Sat, 13 May 2023 03:18:38 +0000 Subject: [PATCH 05/12] gptj working --- gptj.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/gptj.py b/gptj.py index cd60177..7f5b7dd 100644 --- a/gptj.py +++ b/gptj.py @@ -8,7 +8,7 @@ from gptq import * from modelutils import * from quant import * - +from prettytable import PrettyTable def get_gptj(model): import torch def skip(*args, **kwargs): @@ -17,8 +17,10 @@ def skip(*args, **kwargs): torch.nn.init.uniform_ = skip torch.nn.init.normal_ = skip from transformers import GPTJForCausalLM + print(model) model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = 2048 + model.seqlen = model.config.max_position_embeddings + print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad)) return model @torch.no_grad() @@ -154,7 +156,7 @@ def forward(self, inp, **kwargs): # inputs = tokenizer(question, text, return_tensors="pt") # print(inputs.shape) # outputs = model(**inputs) - model(**batch) + model(batch) except ValueError: pass layers[0] = layers[0].module @@ -346,6 +348,46 @@ def sync(): +def main(args): + if args.load: + model = load_quant3(args.model, args.load) + else: + model = get_gptj(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = gptj_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + gptj_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + + dataloader, testloader = get_loaders( + args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + ppl = gptj_eval(model, testloader, DEV) + + if args.save: + gptj_pack3(model, quantizers) + torch.save(model.state_dict(), args.save) + + return ppl if __name__ == '__main__': import argparse @@ -358,7 +400,7 @@ def sync(): help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' ) parser.add_argument( - '--dataset', type=str, default='c4', choices=['wikitext2', 'ptb', 'c4'], + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.' ) parser.add_argument( @@ -443,7 +485,7 @@ def sync(): exit() - for dataset in ['c4']: + for dataset in ['wikitext2', 'ptb', 'c4']: dataloader, testloader = get_loaders( dataset, seed=args.seed, model=args.model, seqlen=model.seqlen ) @@ -458,4 +500,18 @@ def sync(): if args.save_safetensors: gptj_pack(model, quantizers, args.wbits, args.groupsize) from safetensors.torch import save_file as safe_save - safe_save(model.state_dict(), args.save_safetensors) \ No newline at end of file + safe_save(model.state_dict(), args.save_safetensors) + + results = PrettyTable() + results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) + print(results) + print('finished.') \ No newline at end of file From 72edd428af64a08cf2b3f50d6e0f8dd92d4864df Mon Sep 17 00:00:00 2001 From: kumbong Date: Sat, 13 May 2023 08:05:45 +0000 Subject: [PATCH 06/12] push gptj eval --- datautils.py | 51 ++--- evaluation.sh | 76 +++--- gptj.py | 178 ++++++++------ gptj_delta.py | 624 ++++++++++++++++++++++++++++++++++++++++++++++++++ gptq.py | 42 ++-- quant.py | 112 ++++++++- 6 files changed, 916 insertions(+), 167 deletions(-) create mode 100644 gptj_delta.py diff --git a/datautils.py b/datautils.py index 2616fc0..08bfb66 100644 --- a/datautils.py +++ b/datautils.py @@ -6,46 +6,45 @@ def set_seed(seed): torch.random.manual_seed(seed) def get_wikitext2(nsamples, seed, seqlen, model): - seqlen = 2048 from datasets import load_dataset traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt') - testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt') + trainenc = tokenizer("\n\n".join(traindata['text']), max_length = seqlen, truncation=True, return_tensors='pt') + testenc = tokenizer("\n\n".join(testdata['text']), max_length = seqlen, truncation=True, return_tensors='pt') import random random.seed(seed) trainloader = [] for _ in range(nsamples): - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] + # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + # j = i + seqlen + inp = trainenc.input_ids#[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) return trainloader, testenc def get_ptb(nsamples, seed, seqlen, model): - seqlen = 2048 + seqlen = seqlen from datasets import load_dataset traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt') - testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt') + trainenc = tokenizer("\n\n".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') + testenc = tokenizer("\n\n".join(valdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') import random random.seed(seed) trainloader = [] for _ in range(nsamples): - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] + # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + # j = i + seqlen + inp = trainenc.input_ids#[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) @@ -79,7 +78,7 @@ def get_c4(nsamples, seed, seqlen, model): # break # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) # j = i + seqlen - # inp = trainenc.input_ids[:, i:j] + inp = trainenc.input_ids#[:, i:j] inp = trainenc.input_ids tar = inp.clone() tar[:, :-1] = -100 @@ -112,16 +111,16 @@ def get_ptb_new(nsamples, seed, seqlen, model): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt') - testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt') + trainenc = tokenizer(" ".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') + testenc = tokenizer(" ".join(testdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') import random random.seed(seed) trainloader = [] for _ in range(nsamples): - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] + # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + # j = i + seqlen + inp = trainenc.input_ids#[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) @@ -144,14 +143,14 @@ def get_c4_new(nsamples, seed, seqlen, model): random.seed(seed) trainloader = [] for _ in range(nsamples): - while True: - i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], max_length = 256, truncation=True, return_tensors='pt') - if trainenc.input_ids.shape[1] >= seqlen: - break - i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - j = i + seqlen - inp = trainenc.input_ids[:, i:j] + # while True: + # i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') + # if trainenc.input_ids.shape[1] >= seqlen: + # break + # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + # j = i + seqlen + inp = trainenc.input_ids#[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) diff --git a/evaluation.sh b/evaluation.sh index aaea40f..2714383 100755 --- a/evaluation.sh +++ b/evaluation.sh @@ -1,52 +1,50 @@ -CUDA_VISIBLE_DEVICES=0 python3 opt_delta.py \ +CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \ --groupsize 1024 \ --delta \ - --benchmark_results "file_0.txt" \ + --benchmark_results "delta.txt" \ & -CUDA_VISIBLE_DEVICES=1 python3 opt_delta.py \ +CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \ --groupsize 1024 \ --sparsify_hard_threshold \ --fraction_of_zero 0.9 \ --delta \ - --benchmark_results "file_1.txt" \ + --benchmark_results "delta_sparse_0.9.txt" & -CUDA_VISIBLE_DEVICES=2 python3 opt_delta.py \ +CUDA_VISIBLE_DEVICES=2 python3 gptj.py \ --groupsize 1024 \ --sparsify_hard_threshold \ --fraction_of_zero 0.99 \ --delta \ - --benchmark_results "file_2.txt" \ + --benchmark_results "delta_sparse_0.99.txt" \ & -CUDA_VISIBLE_DEVICES=3 python3 opt_delta.py \ - --groupsize 1024 \ - --delta \ - --rank 32 \ - --benchmark_results "file_3.txt" \ -& -CUDA_VISIBLE_DEVICES=4 python3 opt_delta.py \ - --groupsize 1024 \ - --delta \ - --rank 16 \ - --benchmark_results "file_4.txt" \ -& -CUDA_VISIBLE_DEVICES=5 python3 opt_delta.py \ - --groupsize 1024 \ - --delta \ - --rank 64 \ - --benchmark_results "file_5.txt" \ -& -CUDA_VISIBLE_DEVICES=6 python3 opt_delta.py \ - --groupsize 1024 \ - --delta \ - --rank 32 \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.9 \ - --benchmark_results "file_6.txt" \ -& -CUDA_VISIBLE_DEVICES=7 python3 opt_delta.py \ - --groupsize 1024 \ - --delta \ - --rank 32 \ - --sparsify_hard_threshold \ - --fraction_of_zero 0.99 \ - --benchmark_results "file_7.txt" \ No newline at end of file +CUDA_VISIBLE_DEVICES=3 python3 gptj.py \ + --groupsize 1024 \ + --benchmark_results "base.txt" +# & +# CUDA_VISIBLE_DEVICES=4 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 16 \ +# --benchmark_results "file_4.txt" \ +#& +# CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 64 \ +# --benchmark_results "file_5.txt" \ +# & +# CUDA_VISIBLE_DEVICES=6 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 32 \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.9 \ +# --benchmark_results "file_6.txt" \ +# & +# CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \ +# --groupsize 1024 \ +# --delta \ +# --rank 32 \ +# --sparsify_hard_threshold \ +# --fraction_of_zero 0.99 \ +# --benchmark_results "file_7.txt" \ No newline at end of file diff --git a/gptj.py b/gptj.py index 7f5b7dd..4517509 100644 --- a/gptj.py +++ b/gptj.py @@ -9,6 +9,8 @@ from modelutils import * from quant import * from prettytable import PrettyTable +import os + def get_gptj(model): import torch def skip(*args, **kwargs): @@ -17,7 +19,7 @@ def skip(*args, **kwargs): torch.nn.init.uniform_ = skip torch.nn.init.normal_ = skip from transformers import GPTJForCausalLM - print(model) + # print(model) model = GPTJForCausalLM.from_pretrained(model, torch_dtype='auto') model.seqlen = model.config.max_position_embeddings print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad)) @@ -29,9 +31,9 @@ def gptj_sequential(model, dataloader, dev, means=None, stds=None): use_cache = model.config.use_cache model.config.use_cache = False - print(model.transformer.h) + #print(model.transformer.h) layers = model.transformer.h - print(layers) + #print(layers) model.transformer.wte = model.transformer.wte.to(dev) layers[0] = layers[0].to(dev) @@ -54,7 +56,6 @@ def forward(self, inp, **kwargs): layers[0] = Catcher(layers[0]) for batch in dataloader: try: - print(batch[0].shape) model(batch[0].to(dev)) except ValueError: pass @@ -114,6 +115,99 @@ def tmp(_, inp, out): return quantizers +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers @torch.no_grad() def gptj_eval(model, testenc, dev): @@ -124,7 +218,7 @@ def gptj_eval(model, testenc, dev): use_cache = model.config.use_cache model.config.use_cache = False - print(model.transformer.h) + # print(model.transformer.h) layers = model.transformer.h print(layers) @@ -150,12 +244,6 @@ def forward(self, inp, **kwargs): for i in range(nsamples): batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) try: - print(batch.shape) - # question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - # tokenizer = transformers.AutoTokenizer.from_pretrained(model, use_fast=False) - # inputs = tokenizer(question, text, return_tensors="pt") - # print(inputs.shape) - # outputs = model(**inputs) model(batch) except ValueError: pass @@ -384,7 +472,7 @@ def main(args): ppl = gptj_eval(model, testloader, DEV) if args.save: - gptj_pack3(model, quantizers) + gptj_pack(model, quantizers, args.wbits, args.groupsize) torch.save(model.state_dict(), args.save) return ppl @@ -447,63 +535,15 @@ def main(args): '--check', action='store_true', help='Whether to compute perpexity during benchmarking for verification.' ) - - print("just confirming that I am actually running this stuff") - args = parser.parse_args() - - if type(args.load) is not str: - args.load = args.load.as_posix() - - if args.load: - model = load_quant(args.model, args.load, args.wbits, args.groupsize) - else: - print("getting the model") - model = get_gptj(args.model) - model.eval() - print("Done getting the model") - - print("Getting data loaders") - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' ) - print("finished getting data loaders") - if not args.load and args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = gptj_sequential(model, dataloader, DEV) - print(time.time() - tick) - - if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - gptj_multigpu(model, gpus) - else: - model = model.to(DEV) - if args.benchmark: - input_ids = next(iter(dataloader))[0][:, :args.benchmark] - benchmark(model, input_ids, check=args.check) - if args.load: - exit() - - - for dataset in ['wikitext2', 'ptb', 'c4']: - dataloader, testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - print(dataset) - gptj_eval(model, testloader, DEV) - - - if args.save: - gptj_pack(model, quantizers, args.wbits, args.groupsize) - torch.save(model.state_dict(), args.save) - - if args.save_safetensors: - gptj_pack(model, quantizers, args.wbits, args.groupsize) - from safetensors.torch import save_file as safe_save - safe_save(model.state_dict(), args.save_safetensors) + + args = parser.parse_args() results = PrettyTable() - results.field_names = ['Bits', 'wiki', 'ptb', 'c4'] + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] for n_bits in [4, 3, 2]: ppls = [] for dataset in ['wikitext2', 'ptb', 'c4']: @@ -512,6 +552,8 @@ def main(args): args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits) ppl = main(args) ppls.append(ppl) - results.add_row([n_bits, ppls[0], ppls[1], ppls[2]]) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) print(results) - print('finished.') \ No newline at end of file + with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + f.write(str(results)) + print('finished.') diff --git a/gptj_delta.py b/gptj_delta.py new file mode 100644 index 0000000..2ea5dfc --- /dev/null +++ b/gptj_delta.py @@ -0,0 +1,624 @@ +import time +import math + +import torch +import torch.nn as nn +import transformers + +from gptq import * +from modelutils import * +from quant import * +from prettytable import PrettyTable +import os +import copy + +def get_gptj(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import GPTJForCausalLM + model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.float16) + model.seqlen = model.config.max_position_embeddings + print("Num params is", sum(p.numel() for p in model.parameters() if p.requires_grad)) + return model + +@torch.no_grad() +def gptj_sequential(model, dataloader, dev, means=None, stds=None): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + #print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.transformer.h + delta_layers = delta_model.transformer.h + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = delta_layers[i].to(dev) + original_layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + original_outs[j] = original_layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = original_outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def gptj_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + # print(model.transformer.h) + layers = model.transformer.h + print(layers) + + model.transformer.wte = model.transformer.wte.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache ['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) + try: + # print(batch.shape) + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers = model.transformer.h + layers[0] = layers[0].cpu() + model.transformer.wte = model.transformer.wte.cpu() + model.transformer.ln_f = model.transformer.ln_f.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + model.transformer.ln_f = model.transformer.ln_f.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + hidden_states = model.transformer.ln_f(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + + model.config.use_cache = use_cache + +def gptj_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero = quantizers[name] + quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() + qlayers[name].pack(layers[name], scale, zero) + print('Done!') + return model + +def load_quant(model, checkpoint, wbits, groupsize): + from transformers import GPTJConfig, GPTJForCausalLM + config = GPTJConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = GPTJForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done!') + + return model + +def gptj_multigpu(model, gpus): + model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and model.model.norm: + model.model.norm = model.model.norm.to(gpus[-1]) + import copy + model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) + + cache = {'mask': None} + + class MoveModule(nn.Module): + def __init__(self, module): + super().__init__() + self_module = module + self.dev = next(iter(self.module.parameters())).device + def forward(self, *inp, **kwargs): + inp = list(inp) + if inp[0].device != self.dev: + inp[0] = inp[0].to(self.dev) + if cache['mask'] is None or cache ['mask'].device != self.dev: + cache['mask'] = kwargs['attention_mask'].to(self.dev) + kwargs['attention_mask'] = cache['mask'] + tmp = self.module(*inp, **kwargs) + return tmp + + layers = model.model.layers + pergpu = math.ceil(len(layers) / len(gpus)) + for i in range(len(layers)): + layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) + + model.gpus = gpus + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.model.layers): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. + + def sync(): + if hasattr(model, 'gpus'): + for gpu in model.gpus: + torch.cuda.synchronize(gpu) + else: + torch.cuda.synchronize() + max_memory = 0 + with torch.no_grad(): + attention_mask = torch.ones((1, input_ids.numel()), device=DEV) + times = [] + for i in range(input_ids.numel()): + tick = time.time() + + out = model( + input_ids[:, i:i+1], + past_key_values=cache['past'], + attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) + ) + sync() + times.append(time.time() - tick) + print(i, times[-1]) + max_memory = max(max_memory, torch, torch.cuda.memory_allocated() / 1024 /1024) + if check and i != input_ids.numel() - 1: + tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() + cache['past'] = list(out.past_keys_values) + del out + sync() + import numpy as np + print('Median:', np.median(times)) + if check: + print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + print('max memory(MiB):',max_memory) + + + +def main(args): + print(args) + num_params_saved_lr = 0 + num_params = 0 + if args.load: + model = load_quant3(args.model, args.load) + else: + if args.delta and args.wbits<16: + model = get_gptj(args.model) + model.eval() + base_model = get_gptj(args.base_model) + base_model.eval() + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + original_finetuned_model = copy.deepcopy(model) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data-base_p.data).clone() + else: + model = get_gptj(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + if args.delta: + tick = time.time() + quantizers = gptj_sequential_delta(original_finetuned_model, model, dataloader, DEV) + + comp_time = time.time()-tick + else: + quantizers = gptj_sequential(model, dataloader, DEV) + + if args.delta and args.wbits<16: + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + if args.rank>0 and len(finetuned_p.shape) == 2: + print('Finding Low Rank Approximation...') + A = finetuned_p.data.float() + U, S, Vh = torch.pca_lowrank(A, q=args.rank, center=True, niter=5) + A = U @ torch.diag_embed(S) @ Vh.T + finetuned_p.data = A.half() + num_params_saved_lr += torch.numel(A) - (torch.numel(U) + torch.numel(S) + torch.numel(Vh)) + num_params += torch.numel(finetuned_p.data) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + gptj_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + if args.load: + exit() + + dataset = args.dataset + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + ppl = gptj_eval(model, testloader, DEV) + print(ppl) + + if args.rank > 0: + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print("Number of params without low rank ", n_params) + print("Number of params with low rank", n_params - num_params_saved_lr) + if args.save: + gptj_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) + return ppl + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', + help='GPT-J finetuned model to load; pass `togethercomputer/GPT-JT-6B-v1`.' + ) + parser.add_argument( + '--base_model', type=str, default='EleutherAI/gpt-j-6b', + help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' + ) + parser.add_argument( + '--dataset', type=str, default='ptb', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save the quantized GPT-J model under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save the quantized GPT-J model as a `.safetensors` ckpt' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load the quantized GPT-J model' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perpexity during benchmarking for verification.' + ) + parser.add_argument( + '--delta', action='store_true', + help='Whether to use delta compression' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + parser.add_argument( + '--benchmark_results', type=str, default='', + help='store benchmark results' + ) + parser.add_argument( + '--sym', action='store_true', default=True, + help='Whether to use symmetric quantization' + ) + parser.add_argument( + '--trits', action='store_true', default=False, + help='Whether to use trits' + ) + parser.add_argument('--act_order', type=str, default=False) + + args = parser.parse_args() + + results = PrettyTable() + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [4, 3, 2]: + ppls = [] + for dataset in ['wikitext2', 'ptb', 'c4']: + args.dataset = dataset + args.wbits = n_bits + args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) + ppl = main(args) + ppls.append(ppl) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(results) + with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: + f.write(str(results)) + print('finished.') diff --git a/gptq.py b/gptq.py index b4546cc..8f719e1 100644 --- a/gptq.py +++ b/gptq.py @@ -1,19 +1,16 @@ import math import time - import torch -import torch.nn as nn import transformers +import torch.nn as nn -from quant import * - +from quant import quantize DEBUG = False torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False - class GPTQ: def __init__(self, layer): self.layer = layer @@ -57,7 +54,7 @@ def add_batch(self, inp, out): self.H += inp.matmul(inp.t()) def fasterquant( - self, blocksize=128, percdamp=.01, groupsize=-1 + self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False ): W = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): @@ -77,6 +74,11 @@ def fasterquant( H[dead, dead] = 1 W[:, dead] = 0 + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + Losses = torch.zeros_like(W) Q = torch.zeros_like(W) @@ -87,10 +89,6 @@ def fasterquant( H = torch.cholesky_inverse(H) H = torch.linalg.cholesky(H, upper=True) Hinv = H - - scale = [] - zero = [] - now_idx = 1 for i1 in range(0, self.columns, blocksize): i2 = min(i1 + blocksize, self.columns) @@ -109,11 +107,6 @@ def fasterquant( if groupsize != -1: if (i1 + i) % groupsize == 0: self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) - - if ((i1 + i) // groupsize) - now_idx == -1: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - now_idx += 1 q = quantize( w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq @@ -137,22 +130,21 @@ def fasterquant( print(torch.sum(Losses)) torch.cuda.synchronize() - print('time %.2f' % (time.time() - tick)) - print('error', torch.sum(Losses).item()) + total_time = time.time() - tick + # print('time %.2f' % total_time) + error = torch.sum(Losses).item() + # print('error', error) + + if actorder: + invperm = torch.argsort(perm) + Q = Q[:, invperm] if isinstance(self.layer, transformers.Conv1D): Q = Q.t() self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) if DEBUG: print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - - if scale == []: - scale.append(self.quantizer.scale) - zero.append(self.quantizer.zero) - scale = torch.cat(scale,dim=1) - zero = torch.cat(zero,dim=1) - return scale,zero - + def free(self): if DEBUG: self.inp1 = None diff --git a/quant.py b/quant.py index fe58148..946da79 100644 --- a/quant.py +++ b/quant.py @@ -1,9 +1,11 @@ +import math import numpy as np import torch import torch.nn as nn -import math def quantize(x, scale, zero, maxq): + if maxq < 0: + return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) return scale * (q - zero) @@ -16,10 +18,11 @@ def __init__(self, shape=1): self.register_buffer('zero', torch.zeros(shape)) def configure( - self, - bits, perchannel=False, sym=True, - mse=False, norm=2.4, grid=100, maxshrink=.8 - ): + self, + bits, perchannel=False, sym=True, + mse=False, norm=2.4, grid=100, maxshrink=.8, + trits=False + ): self.maxq = torch.tensor(2 ** bits - 1) self.perchannel = perchannel self.sym = sym @@ -27,6 +30,8 @@ def configure( self.norm = norm self.grid = grid self.maxshrink = maxshrink + if trits: + self.maxq = torch.tensor(-1) def find_params(self, x, weight=False): dev = x.device @@ -60,11 +65,15 @@ def find_params(self, x, weight=False): xmin[tmp] = -1 xmax[tmp] = +1 - self.scale = (xmax - xmin) / self.maxq - if self.sym: - self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + if self.maxq < 0: + self.scale = xmax + self.zero = xmin else: - self.zero = torch.round(-xmin / self.scale) + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) if self.mse: best = torch.full([x.shape[0]], float('inf'), device=dev) @@ -124,6 +133,91 @@ def ready(self): except: print('CUDA extension not installed.') +# Assumes layer is perfectly divisible into 1024 * 1024 blocks +class Quant3Linear(nn.Module): + + def __init__(self, infeatures, outfeatures, faster=False): + super().__init__() + self.register_buffer('zeros', torch.zeros((outfeatures, 1))) + self.register_buffer('scales', torch.zeros((outfeatures, 1))) + self.register_buffer('bias', torch.zeros(outfeatures)) + self.register_buffer( + 'qweight', torch.zeros((infeatures // 32 * 3, outfeatures), dtype=torch.int) + ) + self.faster = faster + + def pack(self, linear, scales, zeros): + self.zeros = zeros * scales + self.scales = scales.clone() + self.bias = linear.bias.clone() + + intweight = torch.round((linear.weight.data + self.zeros) / self.scales).to(torch.int) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * 3, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i)) + i += 10 + qweight[row] |= intweight[i] << 30 + row += 1 + qweight[row] |= (intweight[i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 1) + i += 10 + qweight[row] |= intweight[i] << 31 + row += 1 + qweight[row] |= (intweight[i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 2) + i += 10 + row += 1 + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + def forward(self, x): + if x.shape[-1] == x.numel(): + outshape = list(x.shape) + y = self.bias.clone() + outshape[-1] = self.bias.numel() + dtype = x.dtype + if self.faster: + x = x.half() + quant_cuda.vecquant3matmul_faster(x, self.qweight, y, self.scales, self.zeros) + else: + x = x.float() + quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.zeros) + y = y.to(dtype) + return y.reshape(outshape) + raise ValueError('Only supports a single token currently.') + +def make_quant3(module, names, name='', faster=False): + if isinstance(module, Quant3Linear): + return + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + if name1 in names: + setattr( + module, attr, Quant3Linear(tmp.in_features, tmp.out_features, faster=faster) + ) + for name1, child in module.named_children(): + make_quant3(child, names, name + '.' + name1 if name != '' else name1, faster=faster) + +def make_quant_lr(module, r_names, l_names, name='', faster=False): + if isinstance(module, Quant3Linear): + return + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + # Assumes layer is perfectly divisible into 256 * 256 blocks class QuantLinear(nn.Module): def __init__(self, bits, groupsize, infeatures, outfeatures): From 5025ab4d26b5b1565bf67bfcf155d78e2e0e9963 Mon Sep 17 00:00:00 2001 From: kumbong Date: Sat, 13 May 2023 23:32:17 +0000 Subject: [PATCH 07/12] update gptj --- .gitignore | 1 + datautils.py | 131 +++++----------------- delta.txt | 5 + file_5.txt | 12 +- gptj.py | 35 ++++-- gptj_delta.py | 37 +++++-- gptq.py | 12 +- llama.py | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++ quant.py | 4 +- 9 files changed, 402 insertions(+), 137 deletions(-) create mode 100644 delta.txt create mode 100644 llama.py diff --git a/.gitignore b/.gitignore index 761a9f6..83ed498 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.pt build/ dist/ .idea diff --git a/datautils.py b/datautils.py index 08bfb66..a269a22 100644 --- a/datautils.py +++ b/datautils.py @@ -1,10 +1,12 @@ import numpy as np import torch + def set_seed(seed): np.random.seed(seed) torch.random.manual_seed(seed) + def get_wikitext2(nsamples, seed, seqlen, model): from datasets import load_dataset traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') @@ -12,47 +14,44 @@ def get_wikitext2(nsamples, seed, seqlen, model): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer("\n\n".join(traindata['text']), max_length = seqlen, truncation=True, return_tensors='pt') - testenc = tokenizer("\n\n".join(testdata['text']), max_length = seqlen, truncation=True, return_tensors='pt') + trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt') + testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt') import random random.seed(seed) trainloader = [] for _ in range(nsamples): - # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - # j = i + seqlen - inp = trainenc.input_ids#[:, i:j] + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) return trainloader, testenc def get_ptb(nsamples, seed, seqlen, model): - seqlen = seqlen from datasets import load_dataset traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer("\n\n".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') - testenc = tokenizer("\n\n".join(valdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') + trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt') + testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt') import random random.seed(seed) trainloader = [] for _ in range(nsamples): - # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - # j = i + seqlen - inp = trainenc.input_ids#[:, i:j] + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) return trainloader, testenc def get_c4(nsamples, seed, seqlen, model): - print("loading the c4 dataset") - seqlen = 2048 from datasets import load_dataset traindata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' @@ -68,18 +67,14 @@ def get_c4(nsamples, seed, seqlen, model): random.seed(seed) trainloader = [] for _ in range(nsamples): - i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') - # while True: - # i = random.randint(0, len(traindata) - 1) - # trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') - # print(trainenc.input_ids.shape) - # if trainenc.input_ids.shape[1] > seqlen - 1: - # break - # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - # j = i + seqlen - inp = trainenc.input_ids#[:, i:j] - inp = trainenc.input_ids + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] tar = inp.clone() tar[:, :-1] = -100 trainloader.append((inp, tar)) @@ -87,15 +82,15 @@ def get_c4(nsamples, seed, seqlen, model): import random random.seed(0) valenc = [] - # for _ in range(256): - # while True: - i = random.randint(0, len(valdata) - 1) - tmp = tokenizer(valdata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') - # if tmp.input_ids.shape[1] >= seqlen: - # break - # i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) - # j = i + seqlen - valenc.append(tmp.input_ids) + for _ in range(256): + while True: + i = random.randint(0, len(valdata) - 1) + tmp = tokenizer(valdata[i]['text'], return_tensors='pt') + if tmp.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + valenc.append(tmp.input_ids[:, i:j]) valenc = torch.hstack(valenc) class TokenizerWrapper: def __init__(self, input_ids): @@ -104,79 +99,13 @@ def __init__(self, input_ids): return trainloader, valenc -def get_ptb_new(nsamples, seed, seqlen, model): - from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - trainenc = tokenizer(" ".join(traindata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') - testenc = tokenizer(" ".join(testdata['sentence']), max_length = seqlen, truncation=True, return_tensors='pt') - - import random - random.seed(seed) - trainloader = [] - for _ in range(nsamples): - # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - # j = i + seqlen - inp = trainenc.input_ids#[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) - return trainloader, testenc - -def get_c4_new(nsamples, seed, seqlen, model): - print("loading the c4 new dataset") - from datasets import load_dataset - traindata = load_dataset( - 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' - ) - valdata = load_dataset( - 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation' - ) - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) - seqlen = 2048 - import random - random.seed(seed) - trainloader = [] - for _ in range(nsamples): - # while True: - # i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], max_length = seqlen, truncation=True, return_tensors='pt') - # if trainenc.input_ids.shape[1] >= seqlen: - # break - # i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) - # j = i + seqlen - inp = trainenc.input_ids#[:, i:j] - tar = inp.clone() - tar[:, :-1] = -100 - trainloader.append((inp, tar)) - - valenc = tokenizer(' '.join(valdata[:1100]['text']), max_length = 256, truncation=True, return_tensors='pt') - valenc = valenc.input_ids[:, :(256 * seqlen)] - - class TokenizerWrapper: - def __init__(self, input_ids): - self.input_ids = input_ids - valenc = TokenizerWrapper(valenc) - - return trainloader, valenc - def get_loaders( name, nsamples=128, seed=0, seqlen=2048, model='' ): - print("loading from dataset ", name) if 'wikitext2' in name: return get_wikitext2(nsamples, seed, seqlen, model) if 'ptb' in name: - if 'new' in name: - return get_ptb_new(nsamples, seed, seqlen, model) return get_ptb(nsamples, seed, seqlen, model) if 'c4' in name: - if 'new' in name: - return get_c4_new(nsamples, seed, seqlen, model) - return get_c4(nsamples, seed, seqlen, model) + return get_c4(nsamples, seed, seqlen, model) \ No newline at end of file diff --git a/delta.txt b/delta.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/file_5.txt b/file_5.txt index 2655a74..256a39d 100644 --- a/file_5.txt +++ b/file_5.txt @@ -1,7 +1,5 @@ -+------+-----------+--------------------+--------------------+--------------------+--------------------+ -| Bits | n_params | Time | wiki | ptb | c4 | -+------+-----------+--------------------+--------------------+--------------------+--------------------+ -| 2 | 107356160 | 262.34580183029175 | 12.373908996582031 | 18.664175033569336 | 15.718718528747559 | -| 3 | 107356160 | 271.34020018577576 | 12.558426856994629 | 19.13666343688965 | 16.14783477783203 | -| 4 | 107356160 | 255.5096390247345 | 12.59843921661377 | 19.159931182861328 | 16.166603088378906 | -+------+-----------+--------------------+--------------------+--------------------+--------------------+ \ No newline at end of file ++------+-----------+--------------------+--------------------+--------------------+ +| Bits | n_params | wiki | ptb | c4 | ++------+-----------+--------------------+--------------------+--------------------+ +| 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | ++------+-----------+--------------------+--------------------+--------------------+ \ No newline at end of file diff --git a/gptj.py b/gptj.py index 4517509..a98cdd0 100644 --- a/gptj.py +++ b/gptj.py @@ -98,8 +98,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - print(i, name) - print('Quantizing ...') + # print(i, name) + # print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] @@ -188,8 +188,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - print(i, name) - print('Quantizing ...') + # print(i, name) + # print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer gptq[name].free() @@ -259,7 +259,7 @@ def forward(self, inp, **kwargs): attention_mask = cache['attention_mask'] for i in range(len(layers)): - print(i) + #print(i) layer = layers[i].to(dev) if args.nearest: @@ -310,13 +310,13 @@ def gptj_pack(model, quantizers, wbits, groupsize): layers = {n: layers[n] for n in quantizers} make_quant(model, quantizers, wbits, groupsize) qlayers = find_layers(model, [QuantLinear]) - print('Packing ...') + #print('Packing ...') for name in qlayers: print(name) quantizers[name],scale,zero = quantizers[name] quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() qlayers[name].pack(layers[name], scale, zero) - print('Done!') + #print('Done!') return model def load_quant(model, checkpoint, wbits, groupsize): @@ -442,6 +442,12 @@ def main(args): else: model = get_gptj(args.model) model.eval() + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + ppl = gptj_eval(model, testloader, DEV) + print(ppl) + exit() dataloader, testloader = get_loaders( args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen @@ -484,7 +490,7 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument( - '--model', type=str, default='EleutherAI/gpt-j-6b', + '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' ) parser.add_argument( @@ -539,20 +545,27 @@ def main(args): '--benchmark_results', type=str, default='', help='store benchmark results' ) + parser.add_argument( + '--rank', type=int, default=0, + help='The rank to use for decomposing each matrices' + ) args = parser.parse_args() results = PrettyTable() - results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + results.field_names = ['Bits', 'wikitext2', 'ptb'] for n_bits in [4, 3, 2]: ppls = [] - for dataset in ['wikitext2', 'ptb', 'c4']: + for dataset in ['wikitext2', 'ptb']: + print(n_bits) + print(dataset) args.dataset = dataset args.wbits = n_bits args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits) ppl = main(args) ppls.append(ppl) - results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(ppl) + results.add_row([n_bits] + ppls) print(results) with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: f.write(str(results)) diff --git a/gptj_delta.py b/gptj_delta.py index 2ea5dfc..d2302e6 100644 --- a/gptj_delta.py +++ b/gptj_delta.py @@ -12,6 +12,15 @@ import os import copy + +def hard_threshold(x, fraction_of_zero=0.1): + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + return mask * x + def get_gptj(model): import torch def skip(*args, **kwargs): @@ -98,8 +107,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - print(i, name) - print('Quantizing ...') + # print(i, name) + # print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] @@ -188,8 +197,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - print(i, name) - print('Quantizing ...') + # print(i, name) + # print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer gptq[name].free() @@ -506,15 +515,16 @@ def main(args): ) ppl = gptj_eval(model, testloader, DEV) - print(ppl) + print('perpexity for model is', ppl) if args.rank > 0: n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("Number of params without low rank ", n_params) print("Number of params with low rank", n_params - num_params_saved_lr) if args.save: - gptj_pack(model, quantizers, args.wbits, args.groupsize) - torch.save(model.state_dict(), args.save) + # gptj_pack(model, quantizers, args.wbits, args.groupsize) + # torch.save(model.state_dict(), args.save) + pass return ppl if __name__ == '__main__': @@ -603,21 +613,28 @@ def main(args): '--trits', action='store_true', default=False, help='Whether to use trits' ) + parser.add_argument( + '--rank', type=int, default=0, + help='The rank to use for decomposing each matrices' + ) parser.add_argument('--act_order', type=str, default=False) args = parser.parse_args() results = PrettyTable() - results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + results.field_names = ['Bits', 'wikitext2', 'ptb'] for n_bits in [4, 3, 2]: ppls = [] - for dataset in ['wikitext2', 'ptb', 'c4']: + for dataset in ['wikitext2', 'ptb']: + print(n_bits) + print(dataset) args.dataset = dataset args.wbits = n_bits args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) ppl = main(args) ppls.append(ppl) - results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) + print(ppl) + results.add_row([n_bits] + ppls) print(results) with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: f.write(str(results)) diff --git a/gptq.py b/gptq.py index 8f719e1..e60f1fc 100644 --- a/gptq.py +++ b/gptq.py @@ -126,14 +126,14 @@ def fasterquant( if DEBUG: self.layer.weight.data[:, :i2] = Q[:, :i2] self.layer.weight.data[:, i2:] = W[:, i2:] - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - print(torch.sum(Losses)) + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + #print(torch.sum(Losses)) torch.cuda.synchronize() total_time = time.time() - tick - # print('time %.2f' % total_time) + # #print('time %.2f' % total_time) error = torch.sum(Losses).item() - # print('error', error) + # #print('error', error) if actorder: invperm = torch.argsort(perm) @@ -143,8 +143,8 @@ def fasterquant( Q = Q.t() self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) if DEBUG: - print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) - + #print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + pass def free(self): if DEBUG: self.inp1 = None diff --git a/llama.py b/llama.py new file mode 100644 index 0000000..f1591d9 --- /dev/null +++ b/llama.py @@ -0,0 +1,302 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * + + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + 'model', type=str, + help='LlaMa model to load; pass location of hugginface converted checkpoint.' + ) + parser.add_argument( + 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' + ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' + ) + + args = parser.parse_args() + + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = llama_sequential(model, dataloader, DEV) + print(time.time() - tick) + + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) \ No newline at end of file diff --git a/quant.py b/quant.py index 946da79..f57d6d7 100644 --- a/quant.py +++ b/quant.py @@ -131,8 +131,8 @@ def ready(self): try: import quant_cuda except: - print('CUDA extension not installed.') - + #print('CUDA extension not installed.') + pass # Assumes layer is perfectly divisible into 1024 * 1024 blocks class Quant3Linear(nn.Module): From 4cae72910120b22043e203f316134042e54a6c9d Mon Sep 17 00:00:00 2001 From: kumbong Date: Sun, 14 May 2023 06:26:46 +0000 Subject: [PATCH 08/12] WIP --- delta_2_bits.txt | 32 +++ delta_2bits_sparse_099.txt | 8 + delta_4bits.txt | 30 +++ delta_4bits_sparse_09.txt | 8 + delta_sparse_09.txt | 5 + delta_sparse_099.txt | 5 + evaluation.sh | 48 ++-- file_5.txt | 7 +- gptj.py | 67 ++---- gptj_delta.py | 80 ++----- llama_delta.py | 441 +++++++++++++++++++++++++++++++++++++ 11 files changed, 604 insertions(+), 127 deletions(-) create mode 100644 delta_2_bits.txt create mode 100644 delta_2bits_sparse_099.txt create mode 100644 delta_4bits.txt create mode 100644 delta_4bits_sparse_09.txt create mode 100644 delta_sparse_09.txt create mode 100644 delta_sparse_099.txt create mode 100644 llama_delta.py diff --git a/delta_2_bits.txt b/delta_2_bits.txt new file mode 100644 index 0000000..41b76c4 --- /dev/null +++ b/delta_2_bits.txt @@ -0,0 +1,32 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt new file mode 100644 index 0000000..cfb6519 --- /dev/null +++ b/delta_2bits_sparse_099.txt @@ -0,0 +1,8 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... diff --git a/delta_4bits.txt b/delta_4bits.txt new file mode 100644 index 0000000..39698f4 --- /dev/null +++ b/delta_4bits.txt @@ -0,0 +1,30 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt new file mode 100644 index 0000000..cfb6519 --- /dev/null +++ b/delta_4bits_sparse_09.txt @@ -0,0 +1,8 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... diff --git a/delta_sparse_09.txt b/delta_sparse_09.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta_sparse_09.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/delta_sparse_099.txt b/delta_sparse_099.txt new file mode 100644 index 0000000..a1c817b --- /dev/null +++ b/delta_sparse_099.txt @@ -0,0 +1,5 @@ ++------+-----------+------+ +| Bits | wikitext2 | ptb | ++------+-----------+------+ +| 4 | None | None | ++------+-----------+------+ \ No newline at end of file diff --git a/evaluation.sh b/evaluation.sh index 2714383..92d2232 100755 --- a/evaluation.sh +++ b/evaluation.sh @@ -1,25 +1,41 @@ -CUDA_VISIBLE_DEVICES=7 python3 gptj_delta.py \ +CUDA_VISIBLE_DEVICES=7 python3 -u llama_delta.py \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --groupsize 1024 > delta_4bits.txt & + +CUDA_VISIBLE_DEVICES=2 python3 -u llama_delta.py \ + --wbits 2 \ + --true-sequential --act-order --new-eval\ + --groupsize 1024 > delta_2_bits.txt & + +CUDA_VISIBLE_DEVICES=6 python3 -u llama_delta.py \ --groupsize 1024 \ - --delta \ - --benchmark_results "delta.txt" \ -& -CUDA_VISIBLE_DEVICES=5 python3 gptj_delta.py \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.9 > delta_4bits_sparse_09.txt & + +CUDA_VISIBLE_DEVICES=5 python3 -u llama_delta.py \ --groupsize 1024 \ + --wbits 2 \ + --true-sequential --act-order --new-eval\ --sparsify_hard_threshold \ - --fraction_of_zero 0.9 \ - --delta \ - --benchmark_results "delta_sparse_0.9.txt" -& -CUDA_VISIBLE_DEVICES=2 python3 gptj.py \ + --fraction_of_zero 0.9 > delta_2bits_sparse_099.txt & + +CUDA_VISIBLE_DEVICES=4 python3 -u llama_delta.py \ --groupsize 1024 \ + --wbits 4 \ + --true-sequential --act-order --new-eval\ --sparsify_hard_threshold \ - --fraction_of_zero 0.99 \ - --delta \ - --benchmark_results "delta_sparse_0.99.txt" \ -& -CUDA_VISIBLE_DEVICES=3 python3 gptj.py \ + --fraction_of_zero 0.99 > delta_4bits_sparse_09.txt & + +CUDA_VISIBLE_DEVICES=3 python3 -u llama_delta.py \ --groupsize 1024 \ - --benchmark_results "base.txt" + --wbits 2 \ + --true-sequential --act-order --new-eval\ + --sparsify_hard_threshold \ + --fraction_of_zero 0.99 > delta_2bits_sparse_099.txt & + # & # CUDA_VISIBLE_DEVICES=4 python3 gptj_delta.py \ # --groupsize 1024 \ diff --git a/file_5.txt b/file_5.txt index 256a39d..5ef55d7 100644 --- a/file_5.txt +++ b/file_5.txt @@ -1,5 +1,10 @@ +LLAMA - Experiment results + +------+-----------+--------------------+--------------------+--------------------+ | Bits | n_params | wiki | ptb | c4 | +------+-----------+--------------------+--------------------+--------------------+ | 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | -+------+-----------+--------------------+--------------------+--------------------+ \ No newline at end of file ++------+-----------+--------------------+--------------------+--------------------+ +| 4 | 107356160 | 2.9947431087493896 | 1.011309266090393 | 1.0010896921157837 | ++------+-----------+--------------------+--------------------+--------------------+ + diff --git a/gptj.py b/gptj.py index a98cdd0..0ae4900 100644 --- a/gptj.py +++ b/gptj.py @@ -98,8 +98,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - # print(i, name) - # print('Quantizing ...') + print(i, name) + print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] @@ -142,6 +142,7 @@ def forward(self, inp, **kwargs): cache['i'] += 1 cache['attention_mask'] = kwargs['attention_mask'] raise ValueError + layers[0] = Catcher(layers[0]) for batch in dataloader: try: @@ -188,8 +189,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - # print(i, name) - # print('Quantizing ...') + print(i, name) + print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer gptq[name].free() @@ -259,7 +260,7 @@ def forward(self, inp, **kwargs): attention_mask = cache['attention_mask'] for i in range(len(layers)): - #print(i) + print(i) layer = layers[i].to(dev) if args.nearest: @@ -310,13 +311,13 @@ def gptj_pack(model, quantizers, wbits, groupsize): layers = {n: layers[n] for n in quantizers} make_quant(model, quantizers, wbits, groupsize) qlayers = find_layers(model, [QuantLinear]) - #print('Packing ...') + print('Packing ...') for name in qlayers: print(name) quantizers[name],scale,zero = quantizers[name] quantizers[name],scale,zero = quantizers[name].cpu(),scale.cpu(),zero.cpu() qlayers[name].pack(layers[name], scale, zero) - #print('Done!') + print('Done!') return model def load_quant(model, checkpoint, wbits, groupsize): @@ -383,6 +384,7 @@ def forward(self, *inp, **kwargs): model.gpus = gpus def benchmark(model, input_ids, check=False): + print(model) input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) torch.cuda.synchronize() @@ -392,7 +394,7 @@ def tmp(layer, inp, out): if cache['past']: cache['past'][i] = None return tmp - for i, layer in enumerate(model.model.layers): + for i, layer in enumerate(model.transformer.h): layer.register_forward_hook(clear_past(i)) print('Benchmarking ...') @@ -421,55 +423,40 @@ def sync(): ) sync() times.append(time.time() - tick) - print(i, times[-1]) - max_memory = max(max_memory, torch, torch.cuda.memory_allocated() / 1024 /1024) - if check and i != input_ids.numel() - 1: + if i != input_ids.numel() - 1: tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() - cache['past'] = list(out.past_keys_values) + cache['past'] = list(out.past_key_values) del out sync() import numpy as np print('Median:', np.median(times)) - if check: - print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) - print('max memory(MiB):',max_memory) - - + print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + print('max memory(MiB):',max_memory) def main(args): + print(args) if args.load: model = load_quant3(args.model, args.load) else: model = get_gptj(args.model) model.eval() - dataloader, testloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen - ) - ppl = gptj_eval(model, testloader, DEV) - print(ppl) - exit() dataloader, testloader = get_loaders( args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen ) if args.wbits < 16 and not args.nearest: - tick = time.time() + print("Quantizing ...") quantizers = gptj_sequential(model, dataloader, DEV) print(time.time() - tick) if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - gptj_multigpu(model, gpus) - else: - model = model.to(DEV) + model = model.to(DEV) if args.benchmark: input_ids = next(iter(dataloader))[0][:, :args.benchmark] benchmark(model, input_ids, check=args.check) if args.load: exit() - dataloader, testloader = get_loaders( args.dataset, seed=args.seed, model=args.model, seqlen=model.seqlen @@ -490,7 +477,7 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument( - '--model', type=str, default='togethercomputer/GPT-JT-6B-v1', + '--model', type=str, default='EleutherAI/gpt-j-6b', help='GPT-J model to load; pass `EleutherAI/gpt-j-6b`.' ) parser.add_argument( @@ -545,28 +532,20 @@ def main(args): '--benchmark_results', type=str, default='', help='store benchmark results' ) - parser.add_argument( - '--rank', type=int, default=0, - help='The rank to use for decomposing each matrices' - ) args = parser.parse_args() - results = PrettyTable() - results.field_names = ['Bits', 'wikitext2', 'ptb'] - for n_bits in [4, 3, 2]: + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] + for n_bits in [16]: ppls = [] - for dataset in ['wikitext2', 'ptb']: - print(n_bits) - print(dataset) + for dataset in ['wikitext2']: args.dataset = dataset args.wbits = n_bits args.save = 'gptj-%s-wbits%d.pt' % (dataset, n_bits) ppl = main(args) ppls.append(ppl) - print(ppl) - results.add_row([n_bits] + ppls) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) print(results) with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: f.write(str(results)) - print('finished.') + print('finished.') \ No newline at end of file diff --git a/gptj_delta.py b/gptj_delta.py index d2302e6..490e3e5 100644 --- a/gptj_delta.py +++ b/gptj_delta.py @@ -12,15 +12,6 @@ import os import copy - -def hard_threshold(x, fraction_of_zero=0.1): - y, _ = torch.sort(x.view(-1).abs().clone()) - num_params = torch.numel(x) - thresh_index = int(num_params * fraction_of_zero) - threshold = y[thresh_index] - mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) - return mask * x - def get_gptj(model): import torch def skip(*args, **kwargs): @@ -107,8 +98,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - # print(i, name) - # print('Quantizing ...') + print(i, name) + print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize) for j in range(args.nsamples): outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] @@ -197,8 +188,8 @@ def tmp(_, inp, out): h.remove() for name in subset: - # print(i, name) - # print('Quantizing ...') + print(i, name) + print('Quantizing ...') gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer gptq[name].free() @@ -253,7 +244,7 @@ def forward(self, inp, **kwargs): for i in range(nsamples): batch = testenc[:, (i * model.seqlen):((i + 1) *model.seqlen)].to(dev) try: - # print(batch.shape) + print(batch.shape) model(batch) except ValueError: pass @@ -361,37 +352,6 @@ def noop(*args, **kwargs): return model -def gptj_multigpu(model, gpus): - model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) - if hasattr(model.model, 'norm') and model.model.norm: - model.model.norm = model.model.norm.to(gpus[-1]) - import copy - model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) - - cache = {'mask': None} - - class MoveModule(nn.Module): - def __init__(self, module): - super().__init__() - self_module = module - self.dev = next(iter(self.module.parameters())).device - def forward(self, *inp, **kwargs): - inp = list(inp) - if inp[0].device != self.dev: - inp[0] = inp[0].to(self.dev) - if cache['mask'] is None or cache ['mask'].device != self.dev: - cache['mask'] = kwargs['attention_mask'].to(self.dev) - kwargs['attention_mask'] = cache['mask'] - tmp = self.module(*inp, **kwargs) - return tmp - - layers = model.model.layers - pergpu = math.ceil(len(layers) / len(gpus)) - for i in range(len(layers)): - layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) - - model.gpus = gpus - def benchmark(model, input_ids, check=False): input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) torch.cuda.synchronize() @@ -402,7 +362,7 @@ def tmp(layer, inp, out): if cache['past']: cache['past'][i] = None return tmp - for i, layer in enumerate(model.model.layers): + for i, layer in enumerate(model.transformer.h): layer.register_forward_hook(clear_past(i)) print('Benchmarking ...') @@ -498,11 +458,7 @@ def main(args): finetuned_p.data = (base_p.data + finetuned_p.data).clone() if args.benchmark: - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - gptj_multigpu(model, gpus) - else: - model = model.to(DEV) + model = model.to(DEV) if args.benchmark: input_ids = next(iter(dataloader))[0][:, :args.benchmark] benchmark(model, input_ids, check=args.check) @@ -515,16 +471,15 @@ def main(args): ) ppl = gptj_eval(model, testloader, DEV) - print('perpexity for model is', ppl) + print(ppl) if args.rank > 0: n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("Number of params without low rank ", n_params) print("Number of params with low rank", n_params - num_params_saved_lr) if args.save: - # gptj_pack(model, quantizers, args.wbits, args.groupsize) - # torch.save(model.state_dict(), args.save) - pass + gptj_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) return ppl if __name__ == '__main__': @@ -613,29 +568,22 @@ def main(args): '--trits', action='store_true', default=False, help='Whether to use trits' ) - parser.add_argument( - '--rank', type=int, default=0, - help='The rank to use for decomposing each matrices' - ) parser.add_argument('--act_order', type=str, default=False) args = parser.parse_args() results = PrettyTable() - results.field_names = ['Bits', 'wikitext2', 'ptb'] + results.field_names = ['Bits', 'n_params', 'Time', 'wiki'] for n_bits in [4, 3, 2]: ppls = [] - for dataset in ['wikitext2', 'ptb']: - print(n_bits) - print(dataset) + for dataset in ['wikitext2', 'ptb', 'c4']: args.dataset = dataset args.wbits = n_bits args.save = 'gptj-delta-%s-wbits%d.pt' % (dataset, n_bits) ppl = main(args) ppls.append(ppl) - print(ppl) - results.add_row([n_bits] + ppls) + results.add_row([n_bits, n_params, comp_time, ppls[0], ppls[1], ppls[2]]) print(results) with open(os.path.join(os.getcwd(), args.benchmark_results), mode="w") as f: f.write(str(results)) - print('finished.') + print('finished.') \ No newline at end of file diff --git a/llama_delta.py b/llama_delta.py new file mode 100644 index 0000000..d8fa99a --- /dev/null +++ b/llama_delta.py @@ -0,0 +1,441 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * +import copy +import os + +def hard_threshold(x, fraction_of_zero=0.1): + y, _ = torch.sort(x.view(-1).abs().clone()) + num_params = torch.numel(x) + thresh_index = int(num_params * fraction_of_zero) + threshold = y[thresh_index] + mask = x.abs().clone().gt(threshold).type(torch.FloatTensor) + return mask * x + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential_delta(model, delta_model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + + layers = model.model.layers + delta_layers = delta_model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + original_outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(delta_layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + full = find_layers(layer) + + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(i, name) + print('Quantizing ...') + gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=False, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model', type=str, default='ausboss/llama-13b-supercot', + help='LlaMa model to load; pass location of hugginface converted checkpoint.' + ) + parser.add_argument( + '--base-model', type=str, default='yahma/llama-13b-hf', + help='base LLAMA model to load' + ) + parser.add_argument( + '--dataset', type=str, default='wikitext2', choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval.' + ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' + ) + parser.add_argument( + '--sparsify_hard_threshold', action='store_true', + help='Whether to add sparsity' + ) + parser.add_argument( + '--fraction_of_zero', type=float, default=0.99, + help='Sparsity ratio' + ) + args = parser.parse_args() + + base_model = get_llama(args.base_model) + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + original_finetuned_model = copy.deepcopy(model) + _ = llama_sequential_delta(original_finetuned_model, model, dataloader, DEV) + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + finetuned_p.data = (finetuned_p.data-base_p.data).clone() + + + for base_p, finetuned_p in zip(base_model.parameters(), model.parameters()): + if args.sparsify_hard_threshold: + print('Hard Thresholding...') + W = finetuned_p.data + finetuned_p.data = hard_threshold(W, fraction_of_zero=args.fraction_of_zero) + finetuned_p.data = (base_p.data + finetuned_p.data).clone() + + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) \ No newline at end of file From c64f2760507fc3583ac53c37c521fe4727bd84be Mon Sep 17 00:00:00 2001 From: kumbong Date: Sun, 14 May 2023 07:15:22 +0000 Subject: [PATCH 09/12] exec --- evaluation.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation.sh b/evaluation.sh index 92d2232..d995499 100755 --- a/evaluation.sh +++ b/evaluation.sh @@ -20,14 +20,14 @@ CUDA_VISIBLE_DEVICES=5 python3 -u llama_delta.py \ --wbits 2 \ --true-sequential --act-order --new-eval\ --sparsify_hard_threshold \ - --fraction_of_zero 0.9 > delta_2bits_sparse_099.txt & + --fraction_of_zero 0.9 > delta_2bits_sparse_09.txt & CUDA_VISIBLE_DEVICES=4 python3 -u llama_delta.py \ --groupsize 1024 \ --wbits 4 \ --true-sequential --act-order --new-eval\ --sparsify_hard_threshold \ - --fraction_of_zero 0.99 > delta_4bits_sparse_09.txt & + --fraction_of_zero 0.99 > delta_4bits_sparse_099.txt & CUDA_VISIBLE_DEVICES=3 python3 -u llama_delta.py \ --groupsize 1024 \ From e327308dc6b760a973536548d322301388833059 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 14 May 2023 07:53:57 +0000 Subject: [PATCH 10/12] some results --- delta_2_bits.txt | 665 ++++++++++++++++++++++++++++++ delta_2bits_sparse_09.txt | 800 ++++++++++++++++++++++++++++++++++++ delta_2bits_sparse_099.txt | 803 ++++++++++++++++++++++++++++++++++++ delta_4bits.txt | 29 -- delta_4bits_sparse_09.txt | 804 ++++++++++++++++++++++++++++++++++++ delta_4bits_sparse_099.txt | 808 +++++++++++++++++++++++++++++++++++++ 6 files changed, 3880 insertions(+), 29 deletions(-) create mode 100644 delta_2bits_sparse_09.txt create mode 100644 delta_4bits_sparse_099.txt diff --git a/delta_2_bits.txt b/delta_2_bits.txt index 41b76c4..f7647d2 100644 --- a/delta_2_bits.txt +++ b/delta_2_bits.txt @@ -30,3 +30,668 @@ Quantizing ... Quantizing ... 2 self_attn.k_proj Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.183259963989258 +Downloading and preparing dataset ptb_text_only/penn_treebank (download: 5.68 MiB, generated: 5.72 MiB, post-processed: Unknown size, total: 11.40 MiB) to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f... +Dataset ptb_text_only downloaded and prepared to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f. Subsequent calls will reuse this data. +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.479209899902344 +Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde... +Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data. +Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde... +Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-efc3d4f4606f44bd/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data. +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.7892680168151855 diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt new file mode 100644 index 0000000..ae0aeb4 --- /dev/null +++ b/delta_2bits_sparse_09.txt @@ -0,0 +1,800 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt index cfb6519..02be0af 100644 --- a/delta_2bits_sparse_099.txt +++ b/delta_2bits_sparse_099.txt @@ -6,3 +6,806 @@ Quantizing ... Quantizing ... 0 self_attn.q_proj Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... diff --git a/delta_4bits.txt b/delta_4bits.txt index 39698f4..4cecc92 100644 --- a/delta_4bits.txt +++ b/delta_4bits.txt @@ -1,30 +1 @@ Starting ... -Ready. -0 self_attn.k_proj -Quantizing ... -0 self_attn.v_proj -Quantizing ... -0 self_attn.q_proj -Quantizing ... -0 self_attn.o_proj -Quantizing ... -0 mlp.up_proj -Quantizing ... -0 mlp.gate_proj -Quantizing ... -0 mlp.down_proj -Quantizing ... -1 self_attn.k_proj -Quantizing ... -1 self_attn.v_proj -Quantizing ... -1 self_attn.q_proj -Quantizing ... -1 self_attn.o_proj -Quantizing ... -1 mlp.up_proj -Quantizing ... -1 mlp.gate_proj -Quantizing ... -1 mlp.down_proj -Quantizing ... diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt index cfb6519..e173fbb 100644 --- a/delta_4bits_sparse_09.txt +++ b/delta_4bits_sparse_09.txt @@ -6,3 +6,807 @@ Quantizing ... Quantizing ... 0 self_attn.q_proj Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt new file mode 100644 index 0000000..80e3ba1 --- /dev/null +++ b/delta_4bits_sparse_099.txt @@ -0,0 +1,808 @@ +Starting ... +Ready. +0 self_attn.k_proj +Quantizing ... +0 self_attn.v_proj +Quantizing ... +0 self_attn.q_proj +Quantizing ... +0 self_attn.o_proj +Quantizing ... +0 mlp.up_proj +Quantizing ... +0 mlp.gate_proj +Quantizing ... +0 mlp.down_proj +Quantizing ... +1 self_attn.k_proj +Quantizing ... +1 self_attn.v_proj +Quantizing ... +1 self_attn.q_proj +Quantizing ... +1 self_attn.o_proj +Quantizing ... +1 mlp.up_proj +Quantizing ... +1 mlp.gate_proj +Quantizing ... +1 mlp.down_proj +Quantizing ... +2 self_attn.k_proj +Quantizing ... +2 self_attn.v_proj +Quantizing ... +2 self_attn.q_proj +Quantizing ... +2 self_attn.o_proj +Quantizing ... +2 mlp.up_proj +Quantizing ... +2 mlp.gate_proj +Quantizing ... +2 mlp.down_proj +Quantizing ... +3 self_attn.k_proj +Quantizing ... +3 self_attn.v_proj +Quantizing ... +3 self_attn.q_proj +Quantizing ... +3 self_attn.o_proj +Quantizing ... +3 mlp.up_proj +Quantizing ... +3 mlp.gate_proj +Quantizing ... +3 mlp.down_proj +Quantizing ... +4 self_attn.k_proj +Quantizing ... +4 self_attn.v_proj +Quantizing ... +4 self_attn.q_proj +Quantizing ... +4 self_attn.o_proj +Quantizing ... +4 mlp.up_proj +Quantizing ... +4 mlp.gate_proj +Quantizing ... +4 mlp.down_proj +Quantizing ... +5 self_attn.k_proj +Quantizing ... +5 self_attn.v_proj +Quantizing ... +5 self_attn.q_proj +Quantizing ... +5 self_attn.o_proj +Quantizing ... +5 mlp.up_proj +Quantizing ... +5 mlp.gate_proj +Quantizing ... +5 mlp.down_proj +Quantizing ... +6 self_attn.k_proj +Quantizing ... +6 self_attn.v_proj +Quantizing ... +6 self_attn.q_proj +Quantizing ... +6 self_attn.o_proj +Quantizing ... +6 mlp.up_proj +Quantizing ... +6 mlp.gate_proj +Quantizing ... +6 mlp.down_proj +Quantizing ... +7 self_attn.k_proj +Quantizing ... +7 self_attn.v_proj +Quantizing ... +7 self_attn.q_proj +Quantizing ... +7 self_attn.o_proj +Quantizing ... +7 mlp.up_proj +Quantizing ... +7 mlp.gate_proj +Quantizing ... +7 mlp.down_proj +Quantizing ... +8 self_attn.k_proj +Quantizing ... +8 self_attn.v_proj +Quantizing ... +8 self_attn.q_proj +Quantizing ... +8 self_attn.o_proj +Quantizing ... +8 mlp.up_proj +Quantizing ... +8 mlp.gate_proj +Quantizing ... +8 mlp.down_proj +Quantizing ... +9 self_attn.k_proj +Quantizing ... +9 self_attn.v_proj +Quantizing ... +9 self_attn.q_proj +Quantizing ... +9 self_attn.o_proj +Quantizing ... +9 mlp.up_proj +Quantizing ... +9 mlp.gate_proj +Quantizing ... +9 mlp.down_proj +Quantizing ... +10 self_attn.k_proj +Quantizing ... +10 self_attn.v_proj +Quantizing ... +10 self_attn.q_proj +Quantizing ... +10 self_attn.o_proj +Quantizing ... +10 mlp.up_proj +Quantizing ... +10 mlp.gate_proj +Quantizing ... +10 mlp.down_proj +Quantizing ... +11 self_attn.k_proj +Quantizing ... +11 self_attn.v_proj +Quantizing ... +11 self_attn.q_proj +Quantizing ... +11 self_attn.o_proj +Quantizing ... +11 mlp.up_proj +Quantizing ... +11 mlp.gate_proj +Quantizing ... +11 mlp.down_proj +Quantizing ... +12 self_attn.k_proj +Quantizing ... +12 self_attn.v_proj +Quantizing ... +12 self_attn.q_proj +Quantizing ... +12 self_attn.o_proj +Quantizing ... +12 mlp.up_proj +Quantizing ... +12 mlp.gate_proj +Quantizing ... +12 mlp.down_proj +Quantizing ... +13 self_attn.k_proj +Quantizing ... +13 self_attn.v_proj +Quantizing ... +13 self_attn.q_proj +Quantizing ... +13 self_attn.o_proj +Quantizing ... +13 mlp.up_proj +Quantizing ... +13 mlp.gate_proj +Quantizing ... +13 mlp.down_proj +Quantizing ... +14 self_attn.k_proj +Quantizing ... +14 self_attn.v_proj +Quantizing ... +14 self_attn.q_proj +Quantizing ... +14 self_attn.o_proj +Quantizing ... +14 mlp.up_proj +Quantizing ... +14 mlp.gate_proj +Quantizing ... +14 mlp.down_proj +Quantizing ... +15 self_attn.k_proj +Quantizing ... +15 self_attn.v_proj +Quantizing ... +15 self_attn.q_proj +Quantizing ... +15 self_attn.o_proj +Quantizing ... +15 mlp.up_proj +Quantizing ... +15 mlp.gate_proj +Quantizing ... +15 mlp.down_proj +Quantizing ... +16 self_attn.k_proj +Quantizing ... +16 self_attn.v_proj +Quantizing ... +16 self_attn.q_proj +Quantizing ... +16 self_attn.o_proj +Quantizing ... +16 mlp.up_proj +Quantizing ... +16 mlp.gate_proj +Quantizing ... +16 mlp.down_proj +Quantizing ... +17 self_attn.k_proj +Quantizing ... +17 self_attn.v_proj +Quantizing ... +17 self_attn.q_proj +Quantizing ... +17 self_attn.o_proj +Quantizing ... +17 mlp.up_proj +Quantizing ... +17 mlp.gate_proj +Quantizing ... +17 mlp.down_proj +Quantizing ... +18 self_attn.k_proj +Quantizing ... +18 self_attn.v_proj +Quantizing ... +18 self_attn.q_proj +Quantizing ... +18 self_attn.o_proj +Quantizing ... +18 mlp.up_proj +Quantizing ... +18 mlp.gate_proj +Quantizing ... +18 mlp.down_proj +Quantizing ... +19 self_attn.k_proj +Quantizing ... +19 self_attn.v_proj +Quantizing ... +19 self_attn.q_proj +Quantizing ... +19 self_attn.o_proj +Quantizing ... +19 mlp.up_proj +Quantizing ... +19 mlp.gate_proj +Quantizing ... +19 mlp.down_proj +Quantizing ... +20 self_attn.k_proj +Quantizing ... +20 self_attn.v_proj +Quantizing ... +20 self_attn.q_proj +Quantizing ... +20 self_attn.o_proj +Quantizing ... +20 mlp.up_proj +Quantizing ... +20 mlp.gate_proj +Quantizing ... +20 mlp.down_proj +Quantizing ... +21 self_attn.k_proj +Quantizing ... +21 self_attn.v_proj +Quantizing ... +21 self_attn.q_proj +Quantizing ... +21 self_attn.o_proj +Quantizing ... +21 mlp.up_proj +Quantizing ... +21 mlp.gate_proj +Quantizing ... +21 mlp.down_proj +Quantizing ... +22 self_attn.k_proj +Quantizing ... +22 self_attn.v_proj +Quantizing ... +22 self_attn.q_proj +Quantizing ... +22 self_attn.o_proj +Quantizing ... +22 mlp.up_proj +Quantizing ... +22 mlp.gate_proj +Quantizing ... +22 mlp.down_proj +Quantizing ... +23 self_attn.k_proj +Quantizing ... +23 self_attn.v_proj +Quantizing ... +23 self_attn.q_proj +Quantizing ... +23 self_attn.o_proj +Quantizing ... +23 mlp.up_proj +Quantizing ... +23 mlp.gate_proj +Quantizing ... +23 mlp.down_proj +Quantizing ... +24 self_attn.k_proj +Quantizing ... +24 self_attn.v_proj +Quantizing ... +24 self_attn.q_proj +Quantizing ... +24 self_attn.o_proj +Quantizing ... +24 mlp.up_proj +Quantizing ... +24 mlp.gate_proj +Quantizing ... +24 mlp.down_proj +Quantizing ... +25 self_attn.k_proj +Quantizing ... +25 self_attn.v_proj +Quantizing ... +25 self_attn.q_proj +Quantizing ... +25 self_attn.o_proj +Quantizing ... +25 mlp.up_proj +Quantizing ... +25 mlp.gate_proj +Quantizing ... +25 mlp.down_proj +Quantizing ... +26 self_attn.k_proj +Quantizing ... +26 self_attn.v_proj +Quantizing ... +26 self_attn.q_proj +Quantizing ... +26 self_attn.o_proj +Quantizing ... +26 mlp.up_proj +Quantizing ... +26 mlp.gate_proj +Quantizing ... +26 mlp.down_proj +Quantizing ... +27 self_attn.k_proj +Quantizing ... +27 self_attn.v_proj +Quantizing ... +27 self_attn.q_proj +Quantizing ... +27 self_attn.o_proj +Quantizing ... +27 mlp.up_proj +Quantizing ... +27 mlp.gate_proj +Quantizing ... +27 mlp.down_proj +Quantizing ... +28 self_attn.k_proj +Quantizing ... +28 self_attn.v_proj +Quantizing ... +28 self_attn.q_proj +Quantizing ... +28 self_attn.o_proj +Quantizing ... +28 mlp.up_proj +Quantizing ... +28 mlp.gate_proj +Quantizing ... +28 mlp.down_proj +Quantizing ... +29 self_attn.k_proj +Quantizing ... +29 self_attn.v_proj +Quantizing ... +29 self_attn.q_proj +Quantizing ... +29 self_attn.o_proj +Quantizing ... +29 mlp.up_proj +Quantizing ... +29 mlp.gate_proj +Quantizing ... +29 mlp.down_proj +Quantizing ... +30 self_attn.k_proj +Quantizing ... +30 self_attn.v_proj +Quantizing ... +30 self_attn.q_proj +Quantizing ... +30 self_attn.o_proj +Quantizing ... +30 mlp.up_proj +Quantizing ... +30 mlp.gate_proj +Quantizing ... +30 mlp.down_proj +Quantizing ... +31 self_attn.k_proj +Quantizing ... +31 self_attn.v_proj +Quantizing ... +31 self_attn.q_proj +Quantizing ... +31 self_attn.o_proj +Quantizing ... +31 mlp.up_proj +Quantizing ... +31 mlp.gate_proj +Quantizing ... +31 mlp.down_proj +Quantizing ... +32 self_attn.k_proj +Quantizing ... +32 self_attn.v_proj +Quantizing ... +32 self_attn.q_proj +Quantizing ... +32 self_attn.o_proj +Quantizing ... +32 mlp.up_proj +Quantizing ... +32 mlp.gate_proj +Quantizing ... +32 mlp.down_proj +Quantizing ... +33 self_attn.k_proj +Quantizing ... +33 self_attn.v_proj +Quantizing ... +33 self_attn.q_proj +Quantizing ... +33 self_attn.o_proj +Quantizing ... +33 mlp.up_proj +Quantizing ... +33 mlp.gate_proj +Quantizing ... +33 mlp.down_proj +Quantizing ... +34 self_attn.k_proj +Quantizing ... +34 self_attn.v_proj +Quantizing ... +34 self_attn.q_proj +Quantizing ... +34 self_attn.o_proj +Quantizing ... +34 mlp.up_proj +Quantizing ... +34 mlp.gate_proj +Quantizing ... +34 mlp.down_proj +Quantizing ... +35 self_attn.k_proj +Quantizing ... +35 self_attn.v_proj +Quantizing ... +35 self_attn.q_proj +Quantizing ... +35 self_attn.o_proj +Quantizing ... +35 mlp.up_proj +Quantizing ... +35 mlp.gate_proj +Quantizing ... +35 mlp.down_proj +Quantizing ... +36 self_attn.k_proj +Quantizing ... +36 self_attn.v_proj +Quantizing ... +36 self_attn.q_proj +Quantizing ... +36 self_attn.o_proj +Quantizing ... +36 mlp.up_proj +Quantizing ... +36 mlp.gate_proj +Quantizing ... +36 mlp.down_proj +Quantizing ... +37 self_attn.k_proj +Quantizing ... +37 self_attn.v_proj +Quantizing ... +37 self_attn.q_proj +Quantizing ... +37 self_attn.o_proj +Quantizing ... +37 mlp.up_proj +Quantizing ... +37 mlp.gate_proj +Quantizing ... +37 mlp.down_proj +Quantizing ... +38 self_attn.k_proj +Quantizing ... +38 self_attn.v_proj +Quantizing ... +38 self_attn.q_proj +Quantizing ... +38 self_attn.o_proj +Quantizing ... +38 mlp.up_proj +Quantizing ... +38 mlp.gate_proj +Quantizing ... +38 mlp.down_proj +Quantizing ... +39 self_attn.k_proj +Quantizing ... +39 self_attn.v_proj +Quantizing ... +39 self_attn.q_proj +Quantizing ... +39 self_attn.o_proj +Quantizing ... +39 mlp.up_proj +Quantizing ... +39 mlp.gate_proj +Quantizing ... +39 mlp.down_proj +Quantizing ... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... From 8d118a7afbeeabd6b78b65305ea8ca976e16cd3e Mon Sep 17 00:00:00 2001 From: root Date: Sun, 14 May 2023 08:22:52 +0000 Subject: [PATCH 11/12] some results --- delta_2bits_sparse_09.txt | 236 +++++++++++++++++++++++++++++++++++++ delta_2bits_sparse_099.txt | 224 +++++++++++++++++++++++++++++++++++ delta_4bits_sparse_09.txt | 226 +++++++++++++++++++++++++++++++++++ delta_4bits_sparse_099.txt | 226 +++++++++++++++++++++++++++++++++++ 4 files changed, 912 insertions(+) diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt index ae0aeb4..0b50db2 100644 --- a/delta_2bits_sparse_09.txt +++ b/delta_2bits_sparse_09.txt @@ -798,3 +798,239 @@ Hard Thresholding... Hard Thresholding... Hard Thresholding... Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.098198890686035 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.15268325805664 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt index 02be0af..9ec985c 100644 --- a/delta_2bits_sparse_099.txt +++ b/delta_2bits_sparse_099.txt @@ -809,3 +809,227 @@ Hard Thresholding... Hard Thresholding... Hard Thresholding... Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.087564945220947 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.305665969848633 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt index e173fbb..581b389 100644 --- a/delta_4bits_sparse_09.txt +++ b/delta_4bits_sparse_09.txt @@ -810,3 +810,229 @@ Hard Thresholding... Hard Thresholding... Hard Thresholding... Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.098198890686035 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.15268325805664 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt index 80e3ba1..8fb4393 100644 --- a/delta_4bits_sparse_099.txt +++ b/delta_4bits_sparse_099.txt @@ -806,3 +806,229 @@ Hard Thresholding... Hard Thresholding... Hard Thresholding... Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +Hard Thresholding... +wikitext2 +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +5.087564945220947 +ptb-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +19.305665969848633 +c4-new +Evaluating ... +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 From 559eef86554c83d669a6a72ebc89feda2663036f Mon Sep 17 00:00:00 2001 From: root Date: Sun, 14 May 2023 08:31:55 +0000 Subject: [PATCH 12/12] even more results --- delta_2bits_sparse_09.txt | 18 ++++++++++++++++++ delta_2bits_sparse_099.txt | 19 +++++++++++++++++++ delta_4bits_sparse_09.txt | 16 ++++++++++++++++ delta_4bits_sparse_099.txt | 20 ++++++++++++++++++++ 4 files changed, 73 insertions(+) diff --git a/delta_2bits_sparse_09.txt b/delta_2bits_sparse_09.txt index 0b50db2..8089e8e 100644 --- a/delta_2bits_sparse_09.txt +++ b/delta_2bits_sparse_09.txt @@ -1034,3 +1034,21 @@ Evaluating ... 20 21 22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.6517863273620605 diff --git a/delta_2bits_sparse_099.txt b/delta_2bits_sparse_099.txt index 9ec985c..a7ba5b7 100644 --- a/delta_2bits_sparse_099.txt +++ b/delta_2bits_sparse_099.txt @@ -1033,3 +1033,22 @@ Evaluating ... 19 20 21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.614907741546631 diff --git a/delta_4bits_sparse_09.txt b/delta_4bits_sparse_09.txt index 581b389..8089e8e 100644 --- a/delta_4bits_sparse_09.txt +++ b/delta_4bits_sparse_09.txt @@ -1036,3 +1036,19 @@ Evaluating ... 22 23 24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.6517863273620605 diff --git a/delta_4bits_sparse_099.txt b/delta_4bits_sparse_099.txt index 8fb4393..a7ba5b7 100644 --- a/delta_4bits_sparse_099.txt +++ b/delta_4bits_sparse_099.txt @@ -1032,3 +1032,23 @@ Evaluating ... 18 19 20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +6.614907741546631