From 9ffe1f1e67f1b5f7de56ff1d8898ee4e528aa50b Mon Sep 17 00:00:00 2001
From: Daniël de Kok
Date: Wed, 5 Jun 2024 10:45:47 +0200
Subject: [PATCH] Do not initialize scratch space when there are no ExLlamaV2
 layers (#2015)

# What does this PR do?

Do not attempt to allocate ExLlamaV2 scratch buffers when there are no
ExLlamaV2 layers. This avoids a crash during warmup for models that cannot use
ExLlama when ExLlamaV2 is installed, since the scratch-space size is computed
as a `max` over all ExLlamaV2 layers, which fails on an empty sequence.

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed.
Feel free to tag members/contributors who may be interested in your PR.
---
 server/text_generation_server/layers/gptq/exllamav2.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py
index 16a3eb89..4d45822b 100644
--- a/server/text_generation_server/layers/gptq/exllamav2.py
+++ b/server/text_generation_server/layers/gptq/exllamav2.py
@@ -145,6 +145,11 @@ def set_device(device):
 def create_exllama_buffers(max_total_tokens: int):
     global LAYERS, DEVICE
 
+    # No need to initialize scratch space if there are no layers
+    # that use ExLlamaV2.
+    if len(LAYERS) == 0:
+        return
+
     # Find the size of the scratch space.
     scratch_bytes = max(
         layer.scratch_space_fixed(max_input_len=max_total_tokens, max_batch_size=1)
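
For illustration, here is a minimal, self-contained sketch of the failure mode
this guard avoids. It is a simplification under assumptions, not the actual
module: the real `LAYERS` list, device handling, and the body of
`create_exllama_buffers` carry more state, and the `FakeLayer` class and the
generator over `LAYERS` are hypothetical stand-ins for the real quantized
layers.

```python
# Sketch: why create_exllama_buffers needs an early return when no
# ExLlamaV2 layers are registered. Hypothetical, simplified module state.

LAYERS = []  # populated elsewhere with ExLlamaV2-quantized layers


class FakeLayer:
    """Hypothetical stand-in for a quantized layer."""

    def scratch_space_fixed(self, max_input_len: int, max_batch_size: int) -> int:
        return max_input_len * max_batch_size * 8


def create_exllama_buffers(max_total_tokens: int):
    # The guard added by this patch: with zero ExLlamaV2 layers there is
    # nothing to size, and the max() below would raise
    # "ValueError: max() arg is an empty sequence".
    if len(LAYERS) == 0:
        return

    scratch_bytes = max(
        layer.scratch_space_fixed(max_input_len=max_total_tokens, max_batch_size=1)
        for layer in LAYERS
    )
    print(f"allocating {scratch_bytes} bytes of scratch space")


create_exllama_buffers(4096)  # returns immediately: no ExLlamaV2 layers
LAYERS.append(FakeLayer())
create_exllama_buffers(4096)  # sizes and "allocates" the scratch space
```

Without the guard, the first call above would crash, which matches the warmup
crash described in the PR for models that cannot use ExLlama even though
ExLlamaV2 is installed.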