Fix GQA llama + AWQ

2025-09-10 20:04:52 +00:00 · 2023-09-26 06:26:23 +00:00 · 2023-09-26 06:26:23 +00:00 · 1ab173a260
commit 1ab173a260
parent c5de7cd886
1 changed file with 1 addition and 1 deletion
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -179,7 +179,7 @@ def _load_gqa(config, prefix: str, weights):
        dim=0,
    )

-    if config.quantize != "gptq":
+    if config.quantize not in ["gptq", "awq"]:
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads