From 6784d5d0a6e679cc99553412b41908031a8d7d20 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 25 Jul 2024 15:05:26 +0000
Subject: [PATCH] fix: avoid unneeded quantize check

---
 .../text_generation_server/layers/tensor_parallel.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py
index 9516182c..13f12ef1 100644
--- a/server/text_generation_server/layers/tensor_parallel.py
+++ b/server/text_generation_server/layers/tensor_parallel.py
@@ -2,7 +2,6 @@ import torch
 from torch.nn import functional as F
 from typing import Iterable, List
 from text_generation_server.layers.linear import get_linear, FastLinear
-from text_generation_server.layers.exl2 import Exl2Weight
 from text_generation_server.utils.import_utils import SYSTEM
 
 if SYSTEM == "ipex":
@@ -67,16 +66,6 @@ class TensorParallelHead(SuperLayer):
             weight = weights.get_tensor(f"{prefix}.weight")
             should_gather = False
 
-        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
-        if config.quantize in ["gptq", "awq", "eetq", "marlin"]:
-            # Local variable `quantize` is assigned to but never used
-            quantize = None  # noqa F841
-        # See above, exl2 LM head can be quantized or not.
-        elif config.quantize == "exl2" and not isinstance(weight, Exl2Weight):
-            quantize = None  # noqa F841
-        else:
-            quantize = config.quantize  # noqa F841
-
         return TensorParallelHead(
             get_linear(weight, bias=None),
             process_group=weights.process_group,
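
For context, the removed branches only assigned a local `quantize` value that nothing afterwards read (hence the `# noqa F841` suppressions), since the head is built with `get_linear(weight, bias=None)` and no quantize argument, so the deletion is pure dead-code cleanup. Below is a minimal sketch of the shape of `TensorParallelHead.load` after the patch; the sharded-load branch and the trailing constructor argument are assumptions based on the hunk context, not lines taken from this diff.

    @staticmethod
    def load(config, prefix: str, weights):
        if weights.process_group.size() > 1:
            # Assumed sharded path (not shown in this hunk): each rank loads
            # its slice of the LM head and gathers logits later.
            weight = weights.get_sharded(f"{prefix}.weight", dim=0)
            should_gather = True
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False

        # The former GPTQ/AWQ/EETQ/exl2 branches are gone: they only set a
        # local `quantize` that was never read below.
        return TensorParallelHead(
            get_linear(weight, bias=None),
            process_group=weights.process_group,
            should_gather=should_gather,  # assumed; truncated in the hunk context
        )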