Disabling exllama on old compute.

Nicolas Patry 2023-09-06 14:20:03 +02:00
parent c8bbbd8129
commit 1987d37603
2 changed files with 11 additions and 7 deletions

server/text_generation_server/utils/layers.py

@@ -18,13 +18,17 @@ from accelerate import init_empty_weights
 
 from text_generation_server.utils.gptq.quant_linear import QuantLinear
 
-HAS_EXLLAMA = True
+major, _minor = torch.cuda.get_device_capability()
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8
 if os.getenv("DISABLE_EXLLAMA") == "True":
     HAS_EXLLAMA = False
-try:
-    from text_generation_server.utils.gptq.exllama import Ex4bitLinear
-except ImportError:
-    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        from text_generation_server.utils.gptq.exllama import Ex4bitLinear
+        HAS_EXLLAMA = True
+    except ImportError:
+        pass
 
 from typing import Optional
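
For context: torch.cuda.get_device_capability() returns the CUDA compute capability of the current device as a (major, minor) tuple, e.g. (7, 5) for a T4 and (8, 0) for an A100. This hunk therefore only attempts the exllama import when major >= 8, i.e. Ampere or newer, and otherwise leaves HAS_EXLLAMA at False. Below is a minimal standalone sketch of the same gating pattern; the torch.cuda.is_available() guard and the final print are additions for illustration, not part of the diff:

    import os

    import torch

    HAS_EXLLAMA = False
    CAN_EXLLAMA = False

    if torch.cuda.is_available():
        # (major, minor) of the current device, e.g. (8, 0) on A100, (7, 5) on T4.
        major, _minor = torch.cuda.get_device_capability()
        CAN_EXLLAMA = major >= 8

    if os.getenv("DISABLE_EXLLAMA") == "True":
        HAS_EXLLAMA = False  # explicit opt-out wins, even on capable hardware
    elif CAN_EXLLAMA:
        try:
            # Raises ImportError when the extension was never built.
            from text_generation_server.utils.gptq.exllama import Ex4bitLinear  # noqa: F401

            HAS_EXLLAMA = True
        except ImportError:
            pass

    print(f"CAN_EXLLAMA={CAN_EXLLAMA}, HAS_EXLLAMA={HAS_EXLLAMA}")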

server/text_generation_server/utils/weights.py

@@ -170,10 +170,10 @@ class Weights:
                     "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                 )
 
-            from text_generation_server.utils.layers import HAS_EXLLAMA
+            from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
 
             if use_exllama:
-                if not HAS_EXLLAMA:
+                if not HAS_EXLLAMA and CAN_EXLLAMA:
                     logger.warning(
                         "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
                     )
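
This second hunk makes the warning consistent with the new gate: before this commit, a pre-Ampere GPU, on which exllama can never be enabled, would still be told to rebuild with BUILD_EXTENSIONS=True. Adding "and CAN_EXLLAMA" restricts the warning to machines that could actually benefit from the kernels. A condensed sketch of the resulting condition (the helper name is hypothetical; the server logs through loguru):

    from loguru import logger


    def warn_if_exllama_missing(use_exllama: bool, has_exllama: bool, can_exllama: bool) -> None:
        # Warn only when exllama was requested, is not installed, and the GPU
        # (compute capability >= 8) could actually run it.
        if use_exllama and not has_exllama and can_exllama:
            logger.warning(
                "Exllama GPTQ cuda kernels (which are faster) could have been used, "
                "but are not currently installed, try using BUILD_EXTENSIONS=True"
            )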