Give escape hatch to not use exllama kernels even if available.

This commit is contained in:
Nicolas Patry 2023-07-20 17:47:09 +00:00
parent 8cf7c89910
commit 7faef69015

View File

@ -1,3 +1,4 @@
import os
import torch
import torch.distributed
@ -17,6 +18,8 @@ from accelerate import init_empty_weights
from text_generation_server.utils.gptq.quant_linear import QuantLinear
HAS_EXLLAMA = True
if os.getenv("DISABLE_EXLLAMA") == "True":
HAS_EXLLAMA=False
try:
    from text_generation_server.utils.gptq.exllama import Ex4bitLinear
except ImportError: