mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 11:54:52 +00:00
Give an escape hatch to not use exllama kernels even if available.
This commit is contained in:
parent
8cf7c89910
commit
7faef69015
@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed
|
import torch.distributed
|
||||||
|
|
||||||
@ -17,6 +18,8 @@ from accelerate import init_empty_weights
|
|||||||
|
|
||||||
from text_generation_server.utils.gptq.quant_linear import QuantLinear
|
from text_generation_server.utils.gptq.quant_linear import QuantLinear
|
||||||
HAS_EXLLAMA = True
|
HAS_EXLLAMA = True
|
||||||
|
if os.getenv("DISABLE_EXLLAMA") == "True":
|
||||||
|
HAS_EXLLAMA=False
|
||||||
try:
|
try:
|
||||||
from text_generation_server.utils.gptq.exllama import Ex4bitLinear
|
from text_generation_server.utils.gptq.exllama import Ex4bitLinear
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
Loading…
Reference in New Issue
Block a user