mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

commit c84223590b (parent a56bd736e6)

    add medusa
@@ -32,7 +32,7 @@ from text_generation_server.utils.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     PositionRotaryEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
     get_linear,
     FastRMSNorm,
     FastLayerNorm,
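The import swap above is the heart of the change: SpeculativeHead stands in for TensorParallelHead and, when a Medusa checkpoint is configured, returns candidate logits for several future positions alongside the ordinary next-token logits. Below is a minimal sketch of a Medusa-style head, assuming the residual-block design from the Medusa paper; the class and parameter names are illustrative, not TGI's internals.

```python
# Minimal sketch of a Medusa-style speculative head (illustrative only,
# not TGI's implementation). Each extra head reads the same final hidden
# state and proposes the token k steps ahead.
import torch
import torch.nn as nn


class ResidualBlock(nn.Module):
    """One Medusa head: residual linear+SiLU block, then an LM projection."""

    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.act = nn.SiLU()
        self.out = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        return self.out(hidden + self.act(self.linear(hidden)))


class MedusaHead(nn.Module):
    """Base next-token logits plus K stacks of speculative logits."""

    def __init__(self, hidden_size: int, vocab_size: int, n_heads: int = 4):
        super().__init__()
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        self.medusa = nn.ModuleList(
            ResidualBlock(hidden_size, vocab_size) for _ in range(n_heads)
        )

    def forward(self, hidden: torch.Tensor):
        logits = self.lm_head(hidden)  # (batch, vocab)
        # (batch, n_heads, vocab): candidates for positions t+1 .. t+K
        speculative = torch.stack([head(hidden) for head in self.medusa], dim=1)
        return logits, speculative
```

Because every extra head reads the same final hidden state, the K candidates cost one additional matmul per head rather than K extra decoder passes.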
@@ -486,13 +486,13 @@ class FlashStarcoder2ForCausalLM(torch.nn.Module):

         self.model = Starcoder2Model(config, weights)
         try:
-            self.lm_head = TensorParallelHead.load(
+            self.lm_head = SpeculativeHead.load(
                 config,
                 prefix="lm_head",
                 weights=weights,
             )
         except RuntimeError:
-            self.lm_head = TensorParallelHead.load(
+            self.lm_head = SpeculativeHead.load(
                 config,
                 prefix="model.embed_tokens",
                 weights=weights,
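The try/except in this hunk handles checkpoints with tied word embeddings: when no standalone lm_head tensor exists, loading under prefix="lm_head" raises and the head is re-loaded from model.embed_tokens instead. A hypothetical, dict-based illustration of that fallback (TGI's weight loader raises RuntimeError rather than KeyError, and the helper name here is made up):

```python
import torch


def load_head_weight(tensors: dict[str, torch.Tensor]) -> torch.Tensor:
    # Checkpoints with tied embeddings ship no separate lm_head tensor,
    # so fall back to the input embedding matrix, as the hunk above does.
    try:
        return tensors["lm_head.weight"]
    except KeyError:
        return tensors["model.embed_tokens.weight"]
```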
@@ -29,6 +29,7 @@ class FlashStarcoder2(BaseFlashMistral):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -51,6 +52,7 @@ class FlashStarcoder2(BaseFlashMistral):
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         # Set context windows
         if config.sliding_window is not None:
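use_medusa threads the Medusa checkpoint reference through the config so the head loader can pick it up. Once the head returns (logits, speculative_logits), a scheduler still has to verify the proposed tokens. Below is a deliberately simplified, greedy-only sketch of that acceptance step (real Medusa verification uses tree attention over several candidate continuations); all names are hypothetical:

```python
import torch


def accept_prefix(base_logits: torch.Tensor, spec_logits: torch.Tensor) -> list[int]:
    """Accept the longest prefix of the K proposed tokens that the base
    model, replayed over the proposal, would itself have chosen greedily.

    base_logits: (K, vocab) base-model logits at the proposed positions.
    spec_logits: (K, vocab) logits from the K Medusa heads.
    """
    proposed = spec_logits.argmax(dim=-1).tolist()
    verified = base_logits.argmax(dim=-1).tolist()
    accepted: list[int] = []
    for p, v in zip(proposed, verified):
        if p != v:  # first disagreement ends the accepted run
            break
        accepted.append(p)
    return accepted
```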