mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 04:44:52 +00:00
fix: avoid unneeded quantize check
commit 6784d5d0a6
parent a10f4010d7
@@ -2,7 +2,6 @@ import torch
 from torch.nn import functional as F
 from typing import Iterable, List
 from text_generation_server.layers.linear import get_linear, FastLinear
-from text_generation_server.layers.exl2 import Exl2Weight
 from text_generation_server.utils.import_utils import SYSTEM
 
 if SYSTEM == "ipex":
@@ -67,16 +66,6 @@ class TensorParallelHead(SuperLayer):
             weight = weights.get_tensor(f"{prefix}.weight")
             should_gather = False
 
-        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
-        if config.quantize in ["gptq", "awq", "eetq", "marlin"]:
-            # Local variable `quantize` is assigned to but never used
-            quantize = None  # noqa F841
-        # See above, exl2 LM head can be quantized or not.
-        elif config.quantize == "exl2" and not isinstance(weight, Exl2Weight):
-            quantize = None  # noqa F841
-        else:
-            quantize = config.quantize  # noqa F841
-
         return TensorParallelHead(
             get_linear(weight, bias=None),
             process_group=weights.process_group,
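
The deleted branch only assigned a local `quantize` that nothing ever read (hence the `# noqa F841` suppressions), which also left the Exl2Weight import unused; the commit drops both. A minimal sketch of the resulting load path, assuming the enclosing method signature, the elided branches, and the trailing `should_gather` argument, none of which this diff shows:

    @staticmethod
    def load(config, prefix: str, weights):
        ...  # sharded/exl2 loading branches above are unchanged and not shown here
        # Fallback path from the diff context: load the full LM head tensor
        # and skip the cross-rank gather.
        weight = weights.get_tensor(f"{prefix}.weight")
        should_gather = False

        # No quantize bookkeeping remains; the head is built directly from
        # the raw weight.
        return TensorParallelHead(
            get_linear(weight, bias=None),
            process_group=weights.process_group,
            should_gather=should_gather,  # assumed continuation; the diff truncates here
        )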