mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 11:24:53 +00:00
Some GPTQ cases cannot be handled by IPEX but can be handled by Triton.
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent
24c2bff659
commit
f1d7192fd4
@ -12,11 +12,7 @@ from text_generation_server.utils.weights import (
|
|||||||
WeightsLoader,
|
WeightsLoader,
|
||||||
DefaultWeightsLoader,
|
DefaultWeightsLoader,
|
||||||
)
|
)
|
||||||
|
import math
|
||||||
if SYSTEM == "ipex":
|
|
||||||
from .ipex import QuantLinear
|
|
||||||
elif SYSTEM in {"cuda", "rocm"}:
|
|
||||||
from .triton import QuantLinear
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -70,6 +66,19 @@ class GPTQWeight(Weight):
|
|||||||
|
|
||||||
return ExllamaQuantLinear(self, bias)
|
return ExllamaQuantLinear(self, bias)
|
||||||
else:
|
else:
|
||||||
|
if SYSTEM == "ipex" and not (
|
||||||
|
self.device.type == "xpu"
|
||||||
|
and (
|
||||||
|
self.bits != 4
|
||||||
|
or math.ceil(
|
||||||
|
(self.qweight.shape[0] * 32 // self.bits) / self.groupsize
|
||||||
|
)
|
||||||
|
!= self.scales.shape[0]
|
||||||
|
)
|
||||||
|
):
|
||||||
|
from .ipex import QuantLinear
|
||||||
|
else:
|
||||||
|
from .triton import QuantLinear
|
||||||
return QuantLinear(
|
return QuantLinear(
|
||||||
self.qweight,
|
self.qweight,
|
||||||
self.qzeros,
|
self.qzeros,
|
||||||
|
@ -202,7 +202,11 @@ def matmul_248_kernel(
|
|||||||
|
|
||||||
|
|
||||||
def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
|
def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
|
||||||
with torch.cuda.device(input.device):
|
with (
|
||||||
|
torch.xpu.device(input.device)
|
||||||
|
if torch.xpu.is_available()
|
||||||
|
else torch.cuda.device(input.device)
|
||||||
|
):
|
||||||
output = torch.empty(
|
output = torch.empty(
|
||||||
(input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
|
(input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user