Use Marlin even on 89 (CUDA compute capability 8.9)

2025-09-12 04:44:52 +00:00 · 2024-07-23 10:35:32 +02:00 · 2024-07-23 10:35:32 +02:00 · 025f80dfd4
commit 025f80dfd4
parent 473f968a01
1 changed file with 5 additions and 2 deletions
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@@ -32,8 +32,8 @@ def get_fp8_linear() -> torch.nn.Module:
    """

    if SYSTEM == "cuda":
-        major, minor = torch.cuda.get_device_capability()
-        if major == 8 and minor < 9:
+        major, _ = torch.cuda.get_device_capability()
+        if major == 8:
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear

            return GPTQMarlinFP8Linear
@@ -188,6 +188,9 @@ class Fp8Linear(torch.nn.Module):
        dtype,
    ) -> None:
        super().__init__()
+        if FBGEMM_MM_AVAILABLE:
+            log_once(logger.info, "Using FBGEMM fp8 optimized kernels")
+
        self.dtype = dtype
        self.qweight = qweight
        self.scale = scale