Nicolas Patry 2024-02-23 15:10:08 +00:00
parent f592df5234
commit a0095b5b8d
4 changed files with 15 additions and 14 deletions

View File

@@ -140,13 +140,17 @@ def get_model(
         config_dict, _ = PretrainedConfig.get_config_dict(
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
-        medusa_config = hf_hub_download(
-            medusa_model_id, revision=revision, filename="config.json"
-        )
-        hf_hub_download(
-            medusa_model_id, revision=revision, filename="medusa_lm_head.pt"
-        )
-        use_medusa = Path(medusa_config).parent
+        is_local = Path(medusa_model_id).exists()
+        if not is_local:
+            medusa_config = hf_hub_download(
+                medusa_model_id, revision=revision, filename="config.json"
+            )
+            hf_hub_download(
+                medusa_model_id, revision=revision, filename="medusa_lm_head.pt"
+            )
+            use_medusa = Path(medusa_config).parent
+        else:
+            use_medusa = Path(medusa_model_id)
         method = "medusa"
     else:
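The change above lets `medusa_model_id` point at a local directory: when `Path(medusa_model_id).exists()`, the Hub download is skipped and the directory itself becomes `use_medusa`. A minimal sketch of that resolution logic as a standalone helper; the name `resolve_medusa_path` is illustrative, not a function in the repo:

```python
from pathlib import Path
from typing import Optional

from huggingface_hub import hf_hub_download


def resolve_medusa_path(medusa_model_id: str, revision: Optional[str] = None) -> Path:
    """Return the directory holding config.json and the Medusa head weights."""
    local = Path(medusa_model_id)
    if local.exists():
        # Local checkout: use it as-is, no network access needed.
        return local
    # Otherwise fetch both files from the Hub; hf_hub_download returns the
    # cached file path, so its parent directory holds both artifacts.
    config_path = hf_hub_download(
        medusa_model_id, revision=revision, filename="config.json"
    )
    hf_hub_download(medusa_model_id, revision=revision, filename="medusa_lm_head.pt")
    return Path(config_path).parent
```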

View File

@@ -592,7 +592,7 @@ class FlashGemmaForCausalLM(torch.nn.Module):
         input_lengths: torch.Tensor,
         max_s: int,
         lm_head_indices: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         hidden_states = self.model(
             input_ids,
             position_ids,
@@ -605,5 +605,5 @@ class FlashGemmaForCausalLM(torch.nn.Module):
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
-        logits = self.lm_head(hidden_states)
-        return logits
+        logits, speculative_logits = self.lm_head(hidden_states)
+        return logits, speculative_logits
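With a speculative head wired in, `self.lm_head(hidden_states)` now yields a `(logits, speculative_logits)` pair, and `speculative_logits` is presumably `None` whenever no Medusa head is loaded (hence the `Optional` in the return annotation). A hedged caller-side sketch; the function name and tensor shapes are assumptions for illustration, not repo code:

```python
from typing import Optional

import torch


def greedy_next_tokens(
    logits: torch.Tensor,
    speculative_logits: Optional[torch.Tensor],
) -> torch.Tensor:
    # Assumed shapes: logits (batch, vocab);
    # speculative_logits (batch, n_speculative, vocab), one slice per Medusa head.
    next_ids = logits.argmax(dim=-1, keepdim=True)  # (batch, 1)
    if speculative_logits is None:
        return next_ids
    # Greedy candidate per head, to be verified on the next forward pass.
    speculative_ids = speculative_logits.argmax(dim=-1)  # (batch, n_speculative)
    return torch.cat([next_ids, speculative_ids], dim=-1)
```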

View File

@@ -968,8 +968,6 @@ class FlashCausalLM(Model):
             speculative_logits,
         )
-        logger.info(f"Accepted ids {accepted_ids}")
-
         batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
             batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids
         )
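The only functional change here is dropping a leftover debug log. For context, `accepted_ids` holds how many speculative tokens each request accepted this step, and `batch_top_tokens` needs it to expand per-request values into per-token rows. A toy illustration of that expansion, not the helper's actual implementation:

```python
import torch

# Per-request setting (here: top-n) and per-request accepted token counts.
top_n = torch.tensor([5, 0, 3])
accepted_ids = torch.tensor([2, 1, 3])

# Each accepted token gets its own copy of its request's value: 2 + 1 + 3 = 6.
expanded = top_n.repeat_interleave(accepted_ids)
print(expanded)  # tensor([5, 5, 0, 3, 3, 3])
```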

View File

@@ -444,11 +444,10 @@ class SpeculativeHead(nn.Module):
             import json

             medusa_config = str(Path(use_medusa) / "config.json")
-            medusa_head = str(Path(use_medusa) / "medusa_lm_head.pt")
+            filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
             with open(medusa_config, "r") as f:
                 config = json.load(f)
-            filename = medusa_head[: -len(".pt")] + ".safetensors"
             routing = weights.routing
             with safe_open(filename, framework="pytorch") as f:
                 for k in f.keys():
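Rather than deriving the safetensors name from a `.pt` path, the loader now points straight at `medusa_lm_head.safetensors`; safetensors files load without pickle and support lazy, per-tensor reads, which is presumably why the indirection was dropped. A self-contained sketch of that load path under the same directory layout (`config.json` beside the weights; the path below is hypothetical):

```python
import json
from pathlib import Path

from safetensors import safe_open

use_medusa = Path("/path/to/medusa-head")  # hypothetical local directory

with open(use_medusa / "config.json", "r") as f:
    config = json.load(f)

# Tensors are read lazily: keys can be listed before any data is materialized.
tensors = {}
with safe_open(str(use_medusa / "medusa_lm_head.safetensors"), framework="pytorch") as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)
```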