Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-12 04:44:52 +00:00
fix: avoid unneeded quantize check
parent a10f4010d7
commit 6784d5d0a6
@@ -2,7 +2,6 @@ import torch
 from torch.nn import functional as F
 from typing import Iterable, List
 from text_generation_server.layers.linear import get_linear, FastLinear
-from text_generation_server.layers.exl2 import Exl2Weight
 from text_generation_server.utils.import_utils import SYSTEM
 
 if SYSTEM == "ipex":
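The dropped import is accounted for by the second hunk below: with the `isinstance(weight, Exl2Weight)` check deleted, `Exl2Weight` has no remaining use in this file.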
@@ -67,16 +66,6 @@ class TensorParallelHead(SuperLayer):
             weight = weights.get_tensor(f"{prefix}.weight")
             should_gather = False
 
-        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
-        if config.quantize in ["gptq", "awq", "eetq", "marlin"]:
-            # Local variable `quantize` is assigned to but never used
-            quantize = None  # noqa F841
-        # See above, exl2 LM head can be quantized or not.
-        elif config.quantize == "exl2" and not isinstance(weight, Exl2Weight):
-            quantize = None  # noqa F841
-        else:
-            quantize = config.quantize  # noqa F841
-
         return TensorParallelHead(
             get_linear(weight, bias=None),
             process_group=weights.process_group,
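For readers skimming the diff, a minimal sketch of why the deleted block was safe to drop (hypothetical stand-in names, not the repository's code): since the `get_linear` call in the unchanged context takes no quantize argument, every assignment to `quantize` was a dead store, which is exactly what the removed `# noqa F841` markers were suppressing.

# Self-contained illustration with hypothetical names, not the repository's
# API: the callee takes no quantize parameter, so any `quantize` computed by
# the caller is an unused local (flake8 F841) and can simply be deleted.

def get_linear(weight, bias=None):
    # Stand-in for the real get_linear: note the absence of a quantize
    # parameter, so callers have nothing to pass a computed quantize to.
    return ("linear", weight, bias)

def load_head(config_quantize, weight):
    # Before this commit a quantize if/elif/else chain sat here and its
    # result was never read again; after this commit it is simply gone.
    return get_linear(weight, bias=None)

print(load_head("gptq", weight=[1.0, 2.0]))  # ('linear', [1.0, 2.0], None)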