fix: repack for marlin when single scale is provided

2025-10-19 03:45:22 +00:00 · 2024-08-13 16:52:15 -04:00 · 2024-08-13 16:52:15 -04:00 · ab4d480d91
commit ab4d480d91
parent 1cebccc72b
1 changed file with 2 additions and 1 deletion
--- a/server/text_generation_server/layers/marlin/fp8.py
+++ b/server/text_generation_server/layers/marlin/fp8.py
@@ -39,7 +39,8 @@ class GPTQMarlinFP8Linear(nn.Module):
        log_once(logger.info, "GPU does not support FP8, using Marlin FP8 kernel")
        scales = scales.unsqueeze(0)
-        if scales.shape[1] == 1:
+        # repack weights for Marlin if a single scale is provided
+        if scales.size(0) == 1:
            out_features, in_features = qweight.shape
            scales = scales.repeat(1, out_features)
        qweight, scales = repack_fp8_for_marlin(qweight, scales)