diff --git a/server/text_generation_server/utils/gptq/quantize.py b/server/text_generation_server/utils/gptq/quantize.py index 93d86b3a..54d996dd 100644 --- a/server/text_generation_server/utils/gptq/quantize.py +++ b/server/text_generation_server/utils/gptq/quantize.py @@ -19,14 +19,23 @@ DEV = torch.device("cuda:0") class Quantizer(nn.Module): - def __init__(self, shape=1): super(Quantizer, self).__init__() - self.register_buffer('maxq', torch.tensor(0)) - self.register_buffer('scale', torch.zeros(shape)) - self.register_buffer('zero', torch.zeros(shape)) + self.register_buffer("maxq", torch.tensor(0)) + self.register_buffer("scale", torch.zeros(shape)) + self.register_buffer("zero", torch.zeros(shape)) - def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8, trits=False): + def configure( + self, + bits, + perchannel=False, + sym=True, + mse=False, + norm=2.4, + grid=100, + maxshrink=0.8, + trits=False, + ): self.maxq = torch.tensor(2**bits - 1) self.perchannel = perchannel @@ -88,14 +97,16 @@ class Quantizer(nn.Module): self.zero = torch.round(-xmin / self.scale) if self.mse: - best = torch.full([x.shape[0]], float('inf'), device=dev) + best = torch.full([x.shape[0]], float("inf"), device=dev) for i in range(int(self.maxshrink * self.grid)): p = 1 - i / self.grid xmin1 = p * xmin xmax1 = p * xmax scale1 = (xmax1 - xmin1) / self.maxq zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero - q = self._quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q = self._quantize( + x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq + ) q -= x q.abs_() q.pow_(self.norm) @@ -142,7 +153,6 @@ class Quantizer(nn.Module): class GPTQ: - def __init__(self, layer, observe=False): self.layer = layer self.dev = self.layer.weight.device @@ -170,12 +180,19 @@ class GPTQ: if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() if isinstance(self.layer, nn.Conv2d): - unfold = nn.Unfold(self.layer.kernel_size, dilation=self.layer.dilation, padding=self.layer.padding, stride=self.layer.stride) + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) inp = unfold(inp) inp = inp.permute([1, 0, 2]) inp = inp.flatten(1) @@ -188,12 +205,19 @@ class GPTQ: def print_loss(self, name, q_weight, weight_error, timecost): table = Texttable() - name += ' ' * (16 - len(name)) + length = 30 + name = ( + (name + " " * (length - len(name))) + if len(name) <= length + else name[:length] + ) - table.header(['name', 'weight_error', 'fp_inp_SNR', 'q_inp_SNR', 'time']) + table.header(["name", "weight_error", "fp_inp_SNR", "q_inp_SNR", "time"]) # assign weight - self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) + self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to( + self.layer.weight.data.dtype + ) if self.inp1 is not None: # quantize input to int8 @@ -207,13 +231,15 @@ class GPTQ: q_SNR = torch_snr_error(q_out, self.out1).item() fp_SNR = torch_snr_error(self.layer(self.inp1), self.out1).item() else: - q_SNR = '-' - fp_SNR = '-' + q_SNR = "-" + fp_SNR = "-" table.add_row([name, weight_error, fp_SNR, q_SNR, timecost]) - 
print(table.draw().split('\n')[-2]) + print(table.draw().split("\n")[-2]) - def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, name=''): + def fasterquant( + self, blocksize=128, percdamp=0.01, groupsize=-1, actorder=False, name="" + ): self.layer.to(self.dev) W = self.layer.weight.data.clone() @@ -248,7 +274,13 @@ class GPTQ: H[diag, diag] += damp H = torch.linalg.cholesky(H) H = torch.cholesky_inverse(H) - H = torch.linalg.cholesky(H, upper=True) + try: + H = torch.linalg.cholesky(H, upper=True) + except Exception: + # Addition because Falcon fails on h_to_4h + H = torch.linalg.cholesky( + H + 1e-5 * torch.eye(H.shape[0]).to(H.device), upper=True + ) Hinv = H g_idx = [] @@ -272,7 +304,9 @@ class GPTQ: if groupsize != -1: if (i1 + i) % groupsize == 0: - self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + groupsize)], weight=True + ) if ((i1 + i) // groupsize) - now_idx == -1: scale.append(self.quantizer.scale) @@ -281,7 +315,7 @@ class GPTQ: q = self.quantizer.quantize(w.unsqueeze(1)).flatten() Q1[:, i] = q - Losses1[:, i] = (w - q)**2 / d**2 + Losses1[:, i] = (w - q) ** 2 / d**2 err1 = (w - q) / d W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) @@ -306,7 +340,9 @@ class GPTQ: if isinstance(self.layer, transformers.Conv1D): Q = Q.t() - self.print_loss(name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick)) + self.print_loss( + name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick) + ) if scale == []: scale.append(self.quantizer.scale) @@ -326,15 +362,18 @@ class GPTQ: def get_wikitext2(nsamples, seed, seqlen, model_id): from datasets import load_dataset - traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') - testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + + traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") + testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) - trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt') - testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt') + trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt") + testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt") import random + random.seed(seed) trainloader = [] for _ in range(nsamples): @@ -349,18 +388,21 @@ def get_wikitext2(nsamples, seed, seqlen, model_id): def get_ptb(nsamples, seed, seqlen, model_id): from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') + + traindata = load_dataset("ptb_text_only", "penn_treebank", split="train") + valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation") from transformers import AutoTokenizer + try: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) except: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt') - testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt') + trainenc = tokenizer("\n\n".join(traindata["sentence"]), return_tensors="pt") + testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt") import random + random.seed(seed) trainloader = [] 
for _ in range(nsamples): @@ -375,22 +417,37 @@ def get_ptb(nsamples, seed, seqlen, model_id): def get_c4(nsamples, seed, seqlen, model_id): from datasets import load_dataset - traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=False) - valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation', use_auth_token=False) + + traindata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", + use_auth_token=False, + ) + valdata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + use_auth_token=False, + ) from transformers import AutoTokenizer + try: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) except: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) import random + random.seed(seed) trainloader = [] for _ in range(nsamples): while True: i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + trainenc = tokenizer(traindata[i]["text"], return_tensors="pt") if trainenc.input_ids.shape[1] >= seqlen: break i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) @@ -401,12 +458,13 @@ def get_c4(nsamples, seed, seqlen, model_id): trainloader.append((inp, tar)) import random + random.seed(0) valenc = [] for _ in range(256): while True: i = random.randint(0, len(valdata) - 1) - tmp = tokenizer(valdata[i]['text'], return_tensors='pt') + tmp = tokenizer(valdata[i]["text"], return_tensors="pt") if tmp.input_ids.shape[1] >= seqlen: break i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) @@ -415,7 +473,6 @@ def get_c4(nsamples, seed, seqlen, model_id): valenc = torch.hstack(valenc) class TokenizerWrapper: - def __init__(self, input_ids): self.input_ids = input_ids @@ -426,18 +483,21 @@ def get_c4(nsamples, seed, seqlen, model_id): def get_ptb_new(nsamples, seed, seqlen, model_id): from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + + traindata = load_dataset("ptb_text_only", "penn_treebank", split="train") + testdata = load_dataset("ptb_text_only", "penn_treebank", split="test") from transformers import AutoTokenizer + try: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) except: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) - trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt') - testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt') + trainenc = tokenizer(" ".join(traindata["sentence"]), return_tensors="pt") + testenc = tokenizer(" ".join(testdata["sentence"]), return_tensors="pt") import random + random.seed(seed) trainloader = [] for _ in range(nsamples): @@ -452,22 +512,35 @@ def get_ptb_new(nsamples, seed, seqlen, model_id): def get_c4_new(nsamples, seed, seqlen, model_id): from datasets import load_dataset - traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train') - valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') + + traindata = load_dataset( + "allenai/c4", + "allenai--c4", + 
data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", + ) + valdata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + ) from transformers import AutoTokenizer + try: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) except: tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) import random + random.seed(seed) trainloader = [] for _ in range(nsamples): while True: i = random.randint(0, len(traindata) - 1) - trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + trainenc = tokenizer(traindata[i]["text"], return_tensors="pt") if trainenc.input_ids.shape[1] >= seqlen: break i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) @@ -477,11 +550,10 @@ def get_c4_new(nsamples, seed, seqlen, model_id): tar[:, :-1] = -100 trainloader.append((inp, tar)) - valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt') - valenc = valenc.input_ids[:, :(256 * seqlen)] + valenc = tokenizer(" ".join(valdata[:1100]["text"]), return_tensors="pt") + valenc = valenc.input_ids[:, : (256 * seqlen)] class TokenizerWrapper: - def __init__(self, input_ids): self.input_ids = input_ids @@ -490,35 +562,56 @@ def get_c4_new(nsamples, seed, seqlen, model_id): return trainloader, valenc -def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model_id=''): - if 'wikitext2' in name: +def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model_id=""): + if "wikitext2" in name: return get_wikitext2(nsamples, seed, seqlen, model_id) - if 'ptb' in name: - if 'new' in name: + if "ptb" in name: + if "new" in name: return get_ptb_new(nsamples, seed, seqlen, model_id) return get_ptb(nsamples, seed, seqlen, model_id) - if 'c4' in name: - if 'new' in name: + if "c4" in name: + if "new" in name: return get_c4_new(nsamples, seed, seqlen, model_id) return get_c4(nsamples, seed, seqlen, model_id) -def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): +def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""): # Skip last lm_head linear - if type(module) in layers and "lm_head" not in name: + # Need isintance Falcon is inheriting Linear. + if isinstance(module, layers) and "lm_head" not in name: return {name: module} res = {} for name1, child in module.named_children(): - res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1)) + res.update( + find_layers( + child, layers=layers, name=name + "." 
+ name1 if name != "" else name1 + ) + ) return res + @torch.no_grad() -def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, sym: bool=False, act_order: bool = False): - print('Starting ...') +def sequential( + model, + dataloader, + dev, + nsamples, + bits, + groupsize, + percdamp=0.01, + sym: bool = False, + act_order: bool = False, +): + print("Starting ...") use_cache = model.config.use_cache model.config.use_cache = False - layers = model.model.layers + try: + layers = model.model.layers + prefix = "model.layers" + except Exception: + layers = model.transformer.h + prefix = "transformer.h" # embeddings = model.get_input_embeddings() # embeddings = embeddings.to(dev) @@ -528,20 +621,22 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, # layers[0] = layers[0].to(dev) dtype = next(iter(model.parameters())).dtype - inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev) - cache = {'i': 0, 'attention_mask': None} + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + + cache = {"i": 0} + extra = {} class Catcher(nn.Module): - def __init__(self, module): super().__init__() self.module = module def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['position_ids'] = kwargs['position_ids'] + inps[cache["i"]] = inp + cache["i"] += 1 + extra.update(kwargs.copy()) raise ValueError layers[0] = Catcher(layers[0]) @@ -558,19 +653,22 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, torch.cuda.empty_cache() outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'].to(dev) - position_ids = cache['position_ids'].to(dev) - print('Ready.') + extra = { + k: v.to(dev) if isinstance(v, torch.Tensor) else v for k, v in extra.items() + } + + print("Ready.") quantizers = {} for i in range(len(layers)): - print(f'Quantizing layer {i+1}/{len(layers)}..') - print('+------------------+--------------+------------+-----------+-------+') - print('| name | weight_error | fp_inp_SNR | q_inp_SNR | time |') - print('+==================+==============+============+===========+=======+') + print(f"Quantizing layer {i+1}/{len(layers)}..") + print("+------------------+--------------+------------+-----------+-------+") + print("| name | weight_error | fp_inp_SNR | q_inp_SNR | time |") + print("+==================+==============+============+===========+=======+") from accelerate.hooks import remove_hook_from_submodules + layer = layers[i].to(dev) remove_hook_from_submodules(layer) full = find_layers(layer) @@ -581,10 +679,11 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, gptq = {} for name in subset: gptq[name] = GPTQ(subset[name]) - gptq[name].quantizer.configure(bits, perchannel=True, sym=sym, mse=False) + gptq[name].quantizer.configure( + bits, perchannel=True, sym=sym, mse=False + ) def add_batch(name): - def tmp(_, inp, out): gptq[name].add_batch(inp[0].data, out.data) @@ -595,18 +694,30 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, handles.append(subset[name].register_forward_hook(add_batch(name))) for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + outs[j] = layer(inps[j].unsqueeze(0), **extra)[0] for h in handles: h.remove() for name in subset: - scale, zero, g_idx, error = 
gptq[name].fasterquant(percdamp=percdamp, groupsize=groupsize, actorder=act_order, name=name) - quantizers['model.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu(), bits, groupsize) + scale, zero, g_idx, error = gptq[name].fasterquant( + percdamp=percdamp, + groupsize=groupsize, + actorder=act_order, + name=name, + ) + quantizers[f"{prefix}.{i}.{name}"] = ( + gptq[name].quantizer.cpu(), + scale.cpu(), + zero.cpu(), + g_idx.cpu(), + bits, + groupsize, + ) gptq[name].free() for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] + outs[j] = layer(inps[j].unsqueeze(0), **extra)[0] layers[i] = layer.cpu() del layer @@ -614,8 +725,8 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, torch.cuda.empty_cache() inps, outs = outs, inps - print('+------------------+--------------+------------+-----------+-------+') - print('\n') + print("+------------------+--------------+------------+-----------+-------+") + print("\n") # if args.observe: # observer.print() @@ -659,34 +770,34 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, # @torch.no_grad() # def llama_eval(model, testenc, dev): # print('Evaluating ...') -# +# # testenc = testenc.input_ids # nsamples = testenc.numel() // model.seqlen -# +# # use_cache = model.config.use_cache # model.config.use_cache = False # layers = model.model.layers -# +# # model.model.embed_tokens = model.model.embed_tokens.to(dev) # layers[0] = layers[0].to(dev) -# +# # dtype = next(iter(model.parameters())).dtype # inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev) # cache = {'i': 0, 'attention_mask': None} -# +# # class Catcher(nn.Module): -# +# # def __init__(self, module): # super().__init__() # self.module = module -# +# # def forward(self, inp, **kwargs): # inps[cache['i']] = inp # cache['i'] += 1 # cache['attention_mask'] = kwargs['attention_mask'] # cache['position_ids'] = kwargs['position_ids'] # raise ValueError -# +# # layers[0] = Catcher(layers[0]) # for i in range(nsamples): # batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) @@ -695,19 +806,19 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, # except ValueError: # pass # layers[0] = layers[0].module -# +# # layers[0] = layers[0].cpu() # model.model.embed_tokens = model.model.embed_tokens.cpu() # torch.cuda.empty_cache() -# +# # outs = torch.zeros_like(inps) # attention_mask = cache['attention_mask'] # position_ids = cache['position_ids'] -# +# # for i in range(len(layers)): # print(i) # layer = layers[i].to(dev) -# +# # if args.nearest: # subset = find_layers(layer) # for name in subset: @@ -716,18 +827,18 @@ def sequential(model, dataloader, dev, nsamples, bits, groupsize, percdamp=0.01, # W = subset[name].weight.data # quantizer.find_params(W, weight=True) # subset[name].weight.data = quantizer.quantize(W).to(next(iter(layer.parameters())).dtype) -# +# # for j in range(nsamples): # outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] # layers[i] = layer.cpu() # del layer # torch.cuda.empty_cache() # inps, outs = outs, inps -# +# # if model.model.norm is not None: # model.model.norm = model.model.norm.to(dev) # model.lm_head = model.lm_head.to(dev) -# +# # testenc = testenc.to(dev) # nlls = [] # for i in range(nsamples): @@ -743,21 +854,33 @@ def sequential(model, dataloader, dev, 
nsamples, bits, groupsize, percdamp=0.01, # nlls.append(neg_log_likelihood) # ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) # print(ppl.item()) -# +# # model.config.use_cache = use_cache -def make_quant_linear(module, names, bits, groupsize, name=''): + +def make_quant_linear(module, names, bits, groupsize, name=""): if isinstance(module, QuantLinear): return for attr in dir(module): tmp = getattr(module, attr) - name1 = name + '.' + attr if name != '' else attr + name1 = name + "." + attr if name != "" else attr if name1 in names: delattr(module, attr) - setattr(module, attr, QuantLinear.new(bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None)) + setattr( + module, + attr, + QuantLinear.new( + bits, + groupsize, + tmp.in_features, + tmp.out_features, + tmp.bias is not None, + ), + ) for name1, child in module.named_children(): - make_quant_linear(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) - + make_quant_linear( + child, names, bits, groupsize, name + "." + name1 if name != "" else name1 + ) # TODO: perform packing on GPU @@ -765,27 +888,27 @@ def pack(model, quantizers, bits, groupsize): layers = find_layers(model) layers = {n: layers[n] for n in quantizers} make_quant_linear(model, quantizers, bits, groupsize) - qlayers = find_layers(model, [QuantLinear]) - print('Packing ...') + qlayers = find_layers(model, (QuantLinear,)) + print("Packing ...") for name in qlayers: print(name) quantizers[name], scale, zero, g_idx, _, _ = quantizers[name] qlayers[name].pack(layers[name], scale, zero, g_idx) - print('Done.') + print("Done.") return model # def load_quant(model, checkpoint, bits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True): # from transformers import LlamaConfig, LlamaForCausalLM, modeling_utils # config = LlamaConfig.from_pretrained(model) -# +# # def noop(*args, **kwargs): # pass -# +# # torch.nn.init.kaiming_uniform_ = noop # torch.nn.init.uniform_ = noop # torch.nn.init.normal_ = noop -# +# # torch.set_default_dtype(torch.half) # modeling_utils._init_weights = False # torch.set_default_dtype(torch.half) @@ -798,29 +921,29 @@ def pack(model, quantizers, bits, groupsize): # if name in layers: # del layers[name] # quant.make_quant_linear(model, layers, bits, groupsize) -# +# # del layers -# +# # print('Loading model ...') # if checkpoint.endswith('.safetensors'): # from safetensors.torch import load_file as safe_load # model.load_state_dict(safe_load(checkpoint)) # else: # model.load_state_dict(torch.load(checkpoint)) -# +# # if eval: # quant.make_quant_attn(model) # quant.make_quant_norm(model) # if fused_mlp: # quant.make_fused_mlp(model) -# +# # if warmup_autotune: # quant.autotune_warmup_linear(model, transpose=not (eval)) # if eval and fused_mlp: # quant.autotune_warmup_fused(model) # model.seqlen = 2048 # print('Done.') -# +# # return model @@ -830,33 +953,33 @@ def pack(model, quantizers, bits, groupsize): # model.model.norm = model.model.norm.to(gpus[0]) # import copy # model.lm_head = copy.deepcopy(model.lm_head).to(gpus[0]) -# +# # cache = {'mask': None, 'position_ids': None} -# +# # class MoveModule(nn.Module): -# +# # def __init__(self, module, invalidate_cache): # super().__init__() # self.module = module # self.dev = next(iter(self.module.parameters())).device # self.invalidate_cache=invalidate_cache -# +# # def forward(self, *inp, **kwargs): # inp = list(inp) # if inp[0].device != self.dev: # inp[0] = inp[0].to(self.dev) -# +# # if cache['mask'] is None or cache['mask'].device 
!= self.dev or self.invalidate_cache: # cache['mask'] = kwargs['attention_mask'].to(self.dev) # kwargs['attention_mask'] = cache['mask'] -# +# # if cache['position_ids'] is None or cache['position_ids'].device != self.dev or self.invalidate_cache: # cache['position_ids'] = kwargs['position_ids'].to(self.dev) # kwargs['position_ids'] = cache['position_ids'] -# +# # tmp = self.module(*inp, **kwargs) # return tmp -# +# # layers = model.model.layers # from math import ceil # if not gpu_dist: @@ -868,49 +991,49 @@ def pack(model, quantizers, bits, groupsize): # assigned_gpus = [0] * (gpu_dist[0]-1) # for i in range(1, len(gpu_dist)): # assigned_gpus = assigned_gpus + [i] * gpu_dist[i] -# +# # remaining_assignments = len(layers)-len(assigned_gpus) - 1 # if remaining_assignments > 0: # assigned_gpus = assigned_gpus + [-1] * remaining_assignments -# +# # assigned_gpus = assigned_gpus + [0] -# +# # for i in range(len(layers)): # layers[i] = MoveModule(layers[i].to(gpus[assigned_gpus[i]]), i==0) -# +# # model.gpus = gpus -# -# +# +# # def benchmark(model, input_ids, check=False): # input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) # torch.cuda.synchronize() -# +# # cache = {'past': None} -# +# # def clear_past(i): -# +# # def tmp(layer, inp, out): # if cache['past']: # cache['past'][i] = None -# +# # return tmp -# +# # for i, layer in enumerate(model.model.layers): # layer.register_forward_hook(clear_past(i)) -# +# # print('Benchmarking ...') -# +# # if check: # loss = nn.CrossEntropyLoss() # tot = 0. -# +# # def sync(): # if hasattr(model, 'gpus'): # for gpu in model.gpus: # torch.cuda.synchronize(gpu) # else: # torch.cuda.synchronize() -# +# # max_memory = 0 # with torch.no_grad(): # attention_mask = torch.ones((1, input_ids.numel()), device=DEV) @@ -937,9 +1060,16 @@ def pack(model, quantizers, bits, groupsize): # print('max memory(MiB):', max_memory) -def quantize(model_id: str, bits: int, groupsize: int, output_dir: str, trust_remote_code: bool): +def quantize( + model_id: str, bits: int, groupsize: int, output_dir: str, trust_remote_code: bool +): print("loading model") - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="balanced_low_0", trust_remote_code=trust_remote_code) + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.float16, + device_map="balanced_low_0", + trust_remote_code=trust_remote_code, + ) print("LOADED model") model.seqlen = 2048 @@ -947,8 +1077,9 @@ def quantize(model_id: str, bits: int, groupsize: int, output_dir: str, trust_re nsamples = 128 seed = None - - dataloader, testloader = get_loaders(dataset, nsamples=nsamples, seed=seed, model_id=model_id, seqlen=model.seqlen) + dataloader, testloader = get_loaders( + dataset, nsamples=nsamples, seed=seed, model_id=model_id, seqlen=model.seqlen + ) tick = time.time() quantizers = sequential(model, dataloader, DEV, nsamples, bits, groupsize) @@ -972,7 +1103,7 @@ def quantize(model_id: str, bits: int, groupsize: int, output_dir: str, trust_re # dataloader, testloader = get_loaders(dataset, seed=args.seed, model=args.model, seqlen=model.seqlen) # print(dataset) # llama_eval(model, testloader, DEV) - # + # # if args.test_generation: # gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] # if len(gpus) > 1: @@ -986,8 +1117,7 @@ def quantize(model_id: str, bits: int, groupsize: int, output_dir: str, trust_re # streamer = TextStreamer(tokenizer) # with torch.no_grad(): # generated_ids = model.generate(input_ids, 
streamer=streamer) - # - + # # if args.quant_directory is not None: # export_quant_table(quantizers, args.quant_directory) @@ -1000,16 +1130,27 @@ def quantize(model_id: str, bits: int, groupsize: int, output_dir: str, trust_re pack(model, quantizers, bits, groupsize) from safetensors.torch import save_file from transformers.modeling_utils import shard_checkpoint + state_dict = model.state_dict() state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()} state_dict["gptq_bits"] = torch.LongTensor([bits]) state_dict["gptq_groupsize"] = torch.LongTensor([groupsize]) max_shard_size = "10GB" - shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors") + shards, index = shard_checkpoint( + state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors" + ) os.makedirs(output_dir, exist_ok=True) for shard_file, shard in shards.items(): - save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt", "quantized": "gptq", "origin": "text-generation-inference"}) + save_file( + shard, + os.path.join(output_dir, shard_file), + metadata={ + "format": "pt", + "quantized": "gptq", + "origin": "text-generation-inference", + }, + ) if index is None: path_to_weights = os.path.join(output_dir, "model.safetensors") logger.info(f"Model weights saved in {path_to_weights}")
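
Notes on the functional (non-formatting) changes in this patch. The numerically significant one is the Cholesky retry added in GPTQ.fasterquant(). A minimal standalone sketch of that pattern follows; the helper name and the dtype cast are mine, while the 1e-5 jitter is the value hard-coded in the patch:

import torch

def cholesky_upper_with_jitter(H: torch.Tensor, jitter: float = 1e-5) -> torch.Tensor:
    # Upper-triangular Cholesky with a diagonal-jitter fallback, mirroring the
    # try/except added in fasterquant(): some damped Hessians (the patch notes
    # Falcon's h_to_4h projection) are still not numerically positive definite,
    # so a small multiple of the identity is added before retrying.
    try:
        return torch.linalg.cholesky(H, upper=True)
    except Exception:
        eye = torch.eye(H.shape[0], device=H.device, dtype=H.dtype)
        return torch.linalg.cholesky(H + jitter * eye, upper=True)

In the patch this fallback is applied to the inverted Hessian right before the block-wise weight update loop.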
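
The find_layers() change from `type(module) in layers` to `isinstance(module, layers)` (with `layers` now a tuple) is what lets Falcon's Linear subclass be picked up. A short illustration, using a hypothetical FalconLinear stand-in rather than the real class:

import torch.nn as nn

class FalconLinear(nn.Linear):
    # Hypothetical stand-in for a model-specific Linear subclass such as Falcon's.
    pass

layer = FalconLinear(8, 8)
print(type(layer) in (nn.Conv2d, nn.Linear))      # False: exact type check misses subclasses
print(isinstance(layer, (nn.Conv2d, nn.Linear)))  # True: matches subclasses, hence the switch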
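
sequential() now discovers the decoder stack and the matching state-dict prefix instead of assuming the Llama layout, and the Catcher records every forward keyword argument rather than just attention_mask/position_ids, so Falcon's different block signature is forwarded untouched. A condensed sketch of those two pieces (the patch wires inps/cache/extra in as closure variables and catches Exception; this version parameterizes them and catches AttributeError for clarity):

import torch.nn as nn

def get_blocks_and_prefix(model):
    # Llama-style models expose model.model.layers; Falcon-style models expose
    # model.transformer.h. The prefix is reused later as the quantizers-dict key.
    try:
        return model.model.layers, "model.layers"
    except AttributeError:
        return model.transformer.h, "transformer.h"

class Catcher(nn.Module):
    # Wraps the first block: stores its hidden-state input in `inps`, copies all
    # keyword arguments (attention_mask, position_ids, alibi, ...) into `extra`,
    # then raises to abort the forward pass once the capture is done.
    def __init__(self, module, inps, cache, extra):
        super().__init__()
        self.module = module
        self.inps, self.cache, self.extra = inps, cache, extra

    def forward(self, inp, **kwargs):
        self.inps[self.cache["i"]] = inp
        self.cache["i"] += 1
        self.extra.update(kwargs.copy())
        raise ValueError

The captured tensors in `extra` are moved to the target device once and then splatted into every `layer(inps[j].unsqueeze(0), **extra)` call, which is why the hard-coded attention_mask/position_ids arguments disappear from the quantization loop.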
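
Since quantize() now stores gptq_bits / gptq_groupsize as tensors in the state dict and tags every shard with GPTQ metadata, a loader can recover the quantization settings without a side-channel config. A small sketch of the read path, assuming the unsharded case (a single model.safetensors); sharded checkpoints would additionally need the index file, and the helper name and path are illustrative, not part of the patch:

import os
from safetensors import safe_open

def read_gptq_params(output_dir: str):
    # Read back the bits/groupsize tensors and the shard metadata written above.
    path = os.path.join(output_dir, "model.safetensors")
    with safe_open(path, framework="pt", device="cpu") as f:
        bits = int(f.get_tensor("gptq_bits").item())
        groupsize = int(f.get_tensor("gptq_groupsize").item())
        metadata = f.metadata()  # e.g. {"format": "pt", "quantized": "gptq", ...}
    return bits, groupsize, metadata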