From 211e7b7e3503c7e388c2cadb10132d53b1deb8b0 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 6 Sep 2023 15:01:00 +0200
Subject: [PATCH] Disabling exllama on old compute. (#986)

# What does this PR do?

Disables exllama on old compute. Exllama and T4s don't play nice together; this disables exllama right away to avoid issues at runtime. The new gate only enables exllama when the device reports a major CUDA compute capability of 8 or higher (Ampere and newer); a T4 (Turing) reports 7.5 and is therefore excluded.

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 server/text_generation_server/utils/layers.py  | 17 ++++++++++++-----
 server/text_generation_server/utils/weights.py |  4 ++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 745c1d2e..6be54048 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -18,13 +18,20 @@ from accelerate import init_empty_weights
 
 from text_generation_server.utils.gptq.quant_linear import QuantLinear
 
-HAS_EXLLAMA = True
+try:
+    major, _minor = torch.cuda.get_device_capability()
+except Exception:
+    major = 1
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8
 if os.getenv("DISABLE_EXLLAMA") == "True":
     HAS_EXLLAMA = False
-try:
-    from text_generation_server.utils.gptq.exllama import Ex4bitLinear
-except ImportError:
-    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        from text_generation_server.utils.gptq.exllama import Ex4bitLinear
+        HAS_EXLLAMA = True
+    except ImportError:
+        pass
 
 from typing import Optional
 
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index ef662ce1..261456bd 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -170,10 +170,10 @@ class Weights:
                 "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
             )
 
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+        from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
 
         if use_exllama:
-            if not HAS_EXLLAMA:
+            if not HAS_EXLLAMA and CAN_EXLLAMA:
                 logger.warning(
                     "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
                 )
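
For reference, below is a minimal standalone sketch of the gate this patch introduces in `layers.py`. It is illustrative, not the shipped code: the helper name `exllama_flags` is made up for this note, and the real module sets `CAN_EXLLAMA`/`HAS_EXLLAMA` as module-level globals at import time rather than returning them from a function.

```python
import os
from typing import Tuple

import torch


def exllama_flags() -> Tuple[bool, bool]:
    # Returns (can_exllama, has_exllama): whether the GPU is new enough
    # for the exllama kernels, and whether they were actually imported.
    try:
        # Ampere and newer report a major compute capability of 8+;
        # a T4 (Turing) reports (7, 5) and is therefore excluded.
        major, _minor = torch.cuda.get_device_capability()
    except Exception:
        # No usable CUDA device visible: treat it as too old.
        major = 1

    can_exllama = major >= 8
    has_exllama = False

    if os.getenv("DISABLE_EXLLAMA") == "True":
        pass  # explicitly disabled by the user
    elif can_exllama:
        try:
            # In TGI this imports the compiled kernels; ImportError means
            # they were not built (e.g. BUILD_EXTENSIONS was not set).
            from text_generation_server.utils.gptq.exllama import Ex4bitLinear  # noqa: F401

            has_exllama = True
        except ImportError:
            pass

    return can_exllama, has_exllama
```

The `CAN_EXLLAMA`/`HAS_EXLLAMA` split is what makes the `weights.py` change work: the "kernels could have been used" warning now fires only when the hardware can run exllama (`CAN_EXLLAMA`) but the kernels are not installed (`not HAS_EXLLAMA`), so T4 users are no longer prompted to build kernels that would be disabled anyway.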