From aa2014fc79cb3a5e7764bcf5d383dda8a47179c0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 1 Feb 2024 12:48:17 +0100
Subject: [PATCH] post process exllama model

---
 server/text_generation_server/server.py | 33 +++++++++++++++----------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index d5adbd32..08d672f3 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -63,20 +63,27 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
 
     async def Warmup(self, request, context):
-        if self.quantize == "gptq":
-            try:
-                # When using GPTQ, Exllama kernels need some global kernels
-                # For which we have the finale shapes only after the model has loaded
-                # This will allocate those buffers.
-                from text_generation_server.utils.layers import (
-                    create_exllama_buffers,
-                    set_device,
-                )
+        if self.quantize in ["gptq", "awq"]:
+            has_exllama_layers = False
+            for _, module in self.model.model.named_modules():
+                if hasattr(module, "QUANT_TYPE"):
+                    has_exllama_layers = True
+                    break
 
-                set_device(self.model.device)
-                create_exllama_buffers(request.max_prefill_tokens)
-            except ImportError:
-                pass
+            if has_exllama_layers:
+                try:
+                    # When using GPTQ or AWQ, Exllama kernels need some global kernels
+                    # For which we have the final shapes only after the model has loaded
+                    # This will allocate those buffers.
+                    from text_generation_server.utils.layers import (
+                        create_exllama_buffers,
+                        set_device,
+                    )
+
+                    set_device(self.model.device)
+                    create_exllama_buffers(request.max_prefill_tokens)
+                except ImportError:
+                    pass
 
         if (
             self.model.batch_type == IdeficsCausalLMBatch