mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 03:14:53 +00:00
add multi-weight for GPTQ weight loader
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent
ce8978f9ea
commit
475f6e21bc
@ -276,6 +276,63 @@ class GPTQWeightsLoader(WeightsLoader):
|
||||
use_exllama=use_exllama,
|
||||
)
|
||||
|
||||
def get_multi_weights(self, weights: Weights, prefixes: List[str], dim: int):
    """Load GPTQ/AWQ-quantized weights for several prefixes and fuse them.

    The packed tensors (``qweight``, ``qzeros``, ``scales``) of each prefix
    are concatenated column-wise into a single fused weight, which is how
    multi-head projections (e.g. q/k/v) are merged before the quantized
    matmul kernels run.

    Args:
        weights: Weight store used to fetch the per-prefix tensors.
        prefixes: Tensor-name prefixes to load and concatenate.
        dim: Concatenation dimension requested by the caller.
            NOTE(review): for the quantized path the packed tensors are
            always concatenated along ``dim=1`` regardless of this value —
            confirm callers only ever request column-wise merges.

    Returns:
        A :class:`GPTQWeight` holding the fused quantized tensors, or a
        plain tensor from :class:`DefaultWeightsLoader` when the layer is
        excluded from quantization.

    Raises:
        RuntimeError: if the checkpoint does not contain the packed
            quantized tensors (i.e. the model is not actually quantized).
    """
    # Layers listed in `modules_to_not_convert` are stored unquantized;
    # delegate to the default loader for those.
    if self.is_layer_skipped_quantization(prefixes[0], self.modules_to_not_convert):
        return DefaultWeightsLoader.get_multi_weights(weights, prefixes, dim)

    try:
        qweight = torch.cat(
            [weights.get_tensor(f"{p}.qweight") for p in prefixes], dim=1
        )
    except RuntimeError as e:
        # Chain the original error so the missing-tensor cause stays visible.
        raise RuntimeError(
            f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
        ) from e

    scales = torch.cat([weights.get_tensor(f"{p}.scales") for p in prefixes], dim=1)

    # Populate self.bits / self.groupsize / self.desc_act / self.quant_method
    # from the checkpoint before they are read below.
    self._get_gptq_params(weights)

    qzeros = torch.cat([weights.get_tensor(f"{p}.qzeros") for p in prefixes], dim=1)

    # Exllama kernels only support 4-bit GPTQ without activation reordering.
    use_exllama = self.bits == 4 and self.quantize == "gptq" and not self.desc_act

    if self.quantize == "gptq" and self.quant_method == "gptq":
        # All fused projections must share the same group index; verify
        # instead of silently picking one.
        g_idx_list = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
        for other in g_idx_list[1:]:
            torch.testing.assert_close(other, g_idx_list[0])
        g_idx = g_idx_list[0]
    elif self.quantize == "gptq" and self.quant_method == "awq":
        log_once(
            logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
        )
        from text_generation_server.layers.awq.conversion_utils import (
            fast_awq_to_gptq,
        )

        qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
        if use_exllama:
            # Exllama kernels synthesize a trivial g_idx themselves.
            g_idx = None
        else:
            # AWQ has no activation reordering: build the identity group
            # index (one entry per unpacked input row; each int32 column
            # packs 32 // bits values).
            g_idx = (
                torch.arange(
                    qweight.shape[0] * (32 // self.bits),
                    device=qweight.device,
                )
            ).to(dtype=torch.int32)
    else:
        g_idx = None

    return GPTQWeight(
        qweight=qweight,
        qzeros=qzeros,
        scales=scales,
        g_idx=g_idx,
        bits=self.bits,
        groupsize=self.groupsize,
        use_awq_kernel=self.quantize == "awq",
        use_exllama=use_exllama,
    )
|
||||
def get_weights_row(self, weights: Weights, prefix: str):
|
||||
self._get_gptq_params(weights)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user