From f951a8b48db4f368e22956b05effc83cffafc454 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 16 Jan 2025 11:06:11 +0000
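Subject: [PATCH] Do not convert weight scale to e4m3fnuz on CUDA

The guard used `or`, so normalize_e4m3fn_to_e4m3fnuz ran whenever
load_weight_scale was set, regardless of SYSTEM. Use `and` so the
conversion is applied only when a weight scale is loaded and SYSTEM is
"rocm". As a consequence, the conversion is also skipped on ROCm when
load_weight_scale is false.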
---
 .../text_generation_server/layers/compressed_tensors/w8an_fp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
index 15bdce08..ebcc06d6 100644
--- a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
+++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
@@ -147,7 +147,7 @@ class W8ANFpLoader(WeightsLoader):
                 else None
             )
 
-        if self.load_weight_scale or SYSTEM == "rocm":
+        if self.load_weight_scale and SYSTEM == "rocm":
             w, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                 w, weight_scale, input_scale
             )