From 473f968a0131e2979a8ae7a8e05d586be89e4826 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Mon, 22 Jul 2024 18:49:10 +0200
Subject: [PATCH] also quant weights with single scale

---
 server/text_generation_server/layers/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py
index 96d5f4a3..d2c46c58 100644
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@@ -203,7 +203,7 @@ class Fp8Linear(torch.nn.Module):
 
     @classmethod
     def from_unquant(cls, weight, bias, dtype):
-        qweight, scale = fp8_quantize(weight)
+        qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE)
         return cls(
             qweight=qweight, scale=scale, scale_upper_bound=None, bias=bias, dtype=dtype
         )
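
Note: for readers unfamiliar with the "single scale" (scalar) path this patch switches to when FBGEMM matmul kernels are unavailable, the sketch below illustrates what per-tensor FP8 quantization typically looks like. It is a minimal sketch under common conventions; the function name fp8_quantize_scalar and the dequantization convention (dequant = qweight * scale) are illustrative assumptions, not the exact fp8_quantize implementation in fp8.py.

    import torch

    # Largest representable magnitude of FP8 E4M3 (448.0 in PyTorch >= 2.1).
    FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

    def fp8_quantize_scalar(weight: torch.Tensor):
        # Hypothetical sketch: a single scale for the whole tensor,
        # rather than one scale per output row/channel.
        # Map the largest absolute value in the tensor onto the FP8 E4M3 max.
        scale = weight.abs().max().clamp(min=1e-12) / FP8_E4M3_MAX
        qweight = (
            (weight / scale)
            .clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)
            .to(torch.float8_e4m3fn)
        )
        # Dequantize with: qweight.to(weight.dtype) * scale
        return qweight, scale.float()

The trade-off is the usual one: a scalar scale keeps the kernel interface simple (a plain FP8 GEMM plus one multiplier), at the cost of quantization accuracy when a few outlier weights dominate the tensor's dynamic range.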