mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-04-22 15:32:08 +00:00

Fixing import exl2

commit 5f002c678f, parent 01a515dea2
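Summary (derived from the diff below): the commit moves the exllama kernel-detection block (the torch.cuda.get_device_capability() probe and the HAS_EXLLAMA / CAN_EXLLAMA / V2 flags) from the top of the GPTQ module to its very end, after GPTQWeight and GPTQWeightsLoader are defined. As the new in-code comment notes, the block must run last because the exllama/exllamav2 submodules it imports themselves import names from this module, which forms a circular import when the probe sits above the class definitions.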
@@ -8,34 +8,6 @@ from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils.log import log_once
 from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
-
-try:
-    major, _minor = torch.cuda.get_device_capability()
-except Exception:
-    major = 1
-
-HAS_EXLLAMA = False
-CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
-V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
-if os.getenv("DISABLE_EXLLAMA") == "True":
-    HAS_EXLLAMA = False
-elif CAN_EXLLAMA:
-    try:
-        if V2:
-            from text_generation_server.layers.gptq.exllamav2 import (
-                QuantLinear as ExllamaQuantLinear,  # noqa: F401
-            )
-
-            HAS_EXLLAMA = "2"
-        else:
-            from text_generation_server.layers.gptq.exllama import (
-                Ex4bitLinear as ExllamaQuantLinear,  # noqa: F401
-            )
-
-            HAS_EXLLAMA = "1"
-
-    except ImportError:
-        pass
 
 
 @dataclass
 class GPTQWeight(Weight):
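The same block is re-added verbatim at the end of the module, below the class definitions, with a comment recording why it must live there: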
@@ -432,3 +404,33 @@ class GPTQWeightsLoader(WeightsLoader):
             else False
         )
         self.quant_method = "gptq"
+
+
+# Needs to be at the end because circular import.
+try:
+    major, _minor = torch.cuda.get_device_capability()
+except Exception:
+    major = 1
+
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
+V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
+if os.getenv("DISABLE_EXLLAMA") == "True":
+    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        if V2:
+            from text_generation_server.layers.gptq.exllamav2 import (
+                QuantLinear as ExllamaQuantLinear,  # noqa: F401
+            )
+
+            HAS_EXLLAMA = "2"
+        else:
+            from text_generation_server.layers.gptq.exllama import (
+                Ex4bitLinear as ExllamaQuantLinear,  # noqa: F401
+            )
+
+            HAS_EXLLAMA = "1"
+
+    except ImportError:
+        pass
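For context, here is a minimal, self-contained sketch of why the placement matters. The module names (pkg, pkg.backend) and class names (Weight, FastLinear) are hypothetical stand-ins, not the real text_generation_server layout. The script writes two throwaway modules to a temp directory and imports them twice: once with the optional-backend probe above the class definition (the old layout, which fails) and once with the probe at the end (the new layout, which succeeds):

# Sketch of the circular-import problem this commit works around.
# All module/class names here are hypothetical.
import os
import sys
import tempfile
import textwrap

BROKEN_INIT = textwrap.dedent(
    """
    # pkg/__init__.py -- probe sits ABOVE the class definition.
    from pkg.backend import FastLinear  # backend imports pkg -> cycle
    class Weight:
        pass
    """
)

FIXED_INIT = textwrap.dedent(
    """
    # pkg/__init__.py -- define everything first, probe at the end.
    class Weight:
        pass
    # Needs to be at the end because circular import.
    try:
        from pkg.backend import FastLinear  # noqa: F401
    except ImportError:
        FastLinear = None
    """
)

BACKEND = textwrap.dedent(
    """
    # pkg/backend.py -- needs a name from the package it lives in.
    from pkg import Weight
    class FastLinear(Weight):
        pass
    """
)

def try_import(init_source: str) -> None:
    # Write pkg/__init__.py and pkg/backend.py into a fresh temp dir.
    root = tempfile.mkdtemp()
    os.makedirs(os.path.join(root, "pkg"))
    with open(os.path.join(root, "pkg", "__init__.py"), "w") as f:
        f.write(init_source)
    with open(os.path.join(root, "pkg", "backend.py"), "w") as f:
        f.write(BACKEND)
    sys.path.insert(0, root)
    try:
        # Drop any stale entries so each attempt imports from scratch.
        sys.modules.pop("pkg", None)
        sys.modules.pop("pkg.backend", None)
        import pkg
        print("import pkg: OK, FastLinear =", pkg.FastLinear)
    except ImportError as e:
        # Broken layout: backend asks the partially initialized pkg
        # for Weight before __init__.py has defined it.
        print("import pkg: FAILED:", e)
    finally:
        sys.path.remove(root)

try_import(BROKEN_INIT)  # fails: pkg is only partially initialized
try_import(FIXED_INIT)   # succeeds: Weight exists before backend needs it

Note that moving the block does not change its behavior: DISABLE_EXLLAMA=True still forces HAS_EXLLAMA = False, and EXLLAMA_VERSION (default "2") still selects between the exllamav2 and exllama kernels.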