Also quantize weights with a single scale

2025-09-12 04:44:52 +00:00 · 2024-07-22 18:49:10 +02:00 · 2024-07-22 18:49:10 +02:00 · 473f968a01
commit 473f968a01
parent 3d0c7b85fe
1 changed file with 1 addition and 1 deletion
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@@ -203,7 +203,7 @@ class Fp8Linear(torch.nn.Module):

    @classmethod
    def from_unquant(cls, weight, bias, dtype):
-        qweight, scale = fp8_quantize(weight)
+        qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE)
        return cls(
            qweight=qweight, scale=scale, scale_upper_bound=None, bias=bias, dtype=dtype
        )