avoid circular import and fix dockerfile

2025-09-12 04:44:52 +00:00 · 2024-07-19 18:56:41 +02:00 · 2024-07-19 18:56:41 +02:00 · 10cd8ab4a6
commit 10cd8ab4a6
parent 985df12c46
2 changed files with 14 additions and 1 deletion
--- a/3
+++ b/3
@@ -167,6 +167,9 @@ FROM kernel-builder AS fbgemm-builder
 WORKDIR /usr/src

 COPY server/Makefile-fbgemm Makefile
+COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
+COPY server/fix_torch90a.sh fix_torch90a.sh
+
 RUN make build-fbgemm

 # Build vllm CUDA kernels
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -7,7 +7,6 @@ from typing import Dict, List, Optional, Union
 from safetensors import safe_open
 from dataclasses import dataclass

-from text_generation_server.layers.fp8 import Fp8Weight
 from text_generation_server.utils.import_utils import SYSTEM


@@ -126,10 +125,15 @@ class DefaultWeightsLoader(WeightsLoader):
        )

        if w.dtype == torch.float8_e4m3fn:
+            # FIXME: here to avoid circular import
+            from text_generation_server.layers.fp8 import Fp8Weight
+
            if self.weight_class is not None and self.weight_class != Fp8Weight:
                raise RuntimeError(
                    f"Deserialized quantised fp8 weights but weight class is {self.weight_class}"
                )
+            # FIXME: here to avoid circular import
+            from text_generation_server.layers.fp8 import Fp8Weight

            # FP8 branch
            scale = weights.get_packed_sharded(
@@ -148,6 +152,9 @@ class DefaultWeightsLoader(WeightsLoader):

        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
+            # FIXME: here to avoid circular import
+            from text_generation_server.layers.fp8 import Fp8Weight
+
            if self.weight_class is not None and self.weight_class != Fp8Weight:
                raise RuntimeError(
                    f"Deserialized quantised fp8 weights but weight class is {self.weight_class}"
@@ -166,6 +173,9 @@ class DefaultWeightsLoader(WeightsLoader):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
+            # FIXME: here to avoid circular import
+            from text_generation_server.layers.fp8 import Fp8Weight
+
            if self.weight_class is not None and self.weight_class != Fp8Weight:
                raise RuntimeError(
                    f"Deserialized quantised fp8 weights but weight class is {self.weight_class}"