post process exllama model

2025-09-11 12:24:53 +00:00 · 2024-02-01 12:48:17 +01:00 · 2024-02-01 12:48:17 +01:00 · aa2014fc79
commit aa2014fc79
parent 75086526d3
1 changed file with 20 additions and 13 deletions
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -63,9 +63,16 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
        return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
    async def Warmup(self, request, context):
-        if self.quantize == "gptq":
+        if self.quantize in ["gptq", "awq"]:
            has_exllama_layers = False
            for _, module in self.model.model.named_modules():
                if hasattr(module, "QUANT_TYPE"):
                    has_exllama_layers = True
                    break
            if has_exllama_layers:
                try:
-                # When using GPTQ, Exllama kernels need some global kernels
+                    # When using GPTQ or AWQ, Exllama kernels need some global kernels
                    # For which we have the finale shapes only after the model has loaded
                    # This will allocate those buffers.
                    from text_generation_server.utils.layers import (