mirror of https://github.com/huggingface/text-generation-inference.git

commit 94a0bf1bbc (parent 3c71c656c7)

    Tmp work for medusa.
@@ -116,7 +116,7 @@ def download_weights(
         logger.info("Files are already present on the host. " "Skipping download.")
         return
     # Local files not found
-    except (utils.LocalEntryNotFoundError, FileNotFoundError):
+    except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError):
         pass
 
     is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
@@ -137,6 +137,29 @@ def download_weights(
         except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
             pass
 
+        try:
+            import json
+            medusa_head = hf_hub_download(model_id, revision=revision, filename="medusa_lm_head.pt")
+            if auto_convert:
+                medusa_sf = Path(medusa_head[:-len(".pt")] + ".safetensors")
+                if not medusa_sf.exists():
+                    utils.convert_files([Path(medusa_head)], [medusa_sf], [])
+            medusa_config = hf_hub_download(model_id, revision=revision, filename="config.json")
+            with open(medusa_config, "r") as f:
+                config = json.load(f)
+
+            model_id = config["base_model_name_or_path"]
+            revision = "main"
+            try:
+                utils.weight_files(model_id, revision, extension)
+                logger.info(f"Files for parent {model_id} are already present on the host. " "Skipping download.")
+                return
+            # Local files not found
+            except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError):
+                pass
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
+
         # Try to download weights from the hub
         try:
             filenames = utils.weight_hub_files(model_id, revision, extension)
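For orientation, here is roughly what this download path does when pointed at a Medusa checkpoint, as a standalone sketch outside the server. The repo id is illustrative; any Hub repo that ships a medusa_lm_head.pt plus a config.json containing base_model_name_or_path would behave the same way.

```python
# Standalone sketch of the resolution step above (repo id is an example).
import json
from huggingface_hub import hf_hub_download

repo_id = "FasterDecoding/medusa-vicuna-7b-v1.3"

# The Medusa repo only carries the extra decoding heads...
medusa_head = hf_hub_download(repo_id, filename="medusa_lm_head.pt")

# ...while the transformer weights come from the parent model named in its config.
with open(hf_hub_download(repo_id, filename="config.json"), "r") as f:
    base_model_id = json.load(f)["base_model_name_or_path"]

print(f"Medusa heads cached at: {medusa_head}")
print(f"Parent model to fetch separately: {base_model_id}")
```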
@@ -129,6 +129,17 @@ def get_model(
     config_dict, _ = PretrainedConfig.get_config_dict(
         model_id, revision=revision, trust_remote_code=trust_remote_code
     )
+
+    use_medusa = None
+    if "medusa_num_heads" in config_dict:
+        use_medusa = model_id
+        medusa_config = config_dict
+        model_id = config_dict["base_model_name_or_path"]
+        revision = "main"
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+
     model_type = config_dict["model_type"]
 
     if model_type == "gpt_bigcode":
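The branch above keys off fields in the Medusa adapter's own config.json rather than the base model's. A representative shape, with assumed example values not taken from the commit:

```python
# Assumed shape of a Medusa adapter's config.json. "medusa_num_heads" is the
# marker get_model() checks for, and "base_model_name_or_path" is what
# model_id gets rewritten to before the base config is fetched again.
medusa_config_dict = {
    "medusa_num_heads": 4,                             # speculative heads
    "medusa_num_layers": 1,                            # ResBlocks per head
    "base_model_name_or_path": "lmsys/vicuna-7b-v1.3"  # parent model (example)
}
```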
@@ -204,6 +215,7 @@ def get_model(
                 quantize=quantize,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
+                use_medusa=use_medusa
             )
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Llama"))
@@ -28,6 +28,7 @@ class FlashLlama(FlashCausalLM):
         quantize: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
+        use_medusa: Optional[str] = None,
     ):
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
|
|||||||
weights._set_gptq_params(model_id)
|
weights._set_gptq_params(model_id)
|
||||||
|
|
||||||
model = FlashLlamaForCausalLM(config, weights)
|
model = FlashLlamaForCausalLM(config, weights)
|
||||||
|
if use_medusa:
|
||||||
|
from text_generation_server.utils.medusa import MedusaModel
|
||||||
|
from huggingface_hub import hf_hub_download
|
||||||
|
import json
|
||||||
|
medusa_config = hf_hub_download(use_medusa, revision=revision, filename="config.json")
|
||||||
|
with open(medusa_config, "r") as f:
|
||||||
|
config = json.load(f)
|
||||||
|
medusa_head = hf_hub_download(use_medusa, revision=revision, filename="medusa_lm_head.pt")
|
||||||
|
medusa_sf = medusa_head[:-len(".pt")] + ".safetensors"
|
||||||
|
weights = Weights([medusa_sf], device, dtype, process_group=self.process_group)
|
||||||
|
model.lm_head = MedusaModel(config, weights)
|
||||||
|
|
||||||
torch.distributed.barrier(group=self.process_group)
|
torch.distributed.barrier(group=self.process_group)
|
||||||
super(FlashLlama, self).__init__(
|
super(FlashLlama, self).__init__(
|
||||||
|
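One subtlety here: the .pt checkpoint is assumed to have already been converted to safetensors by the download_weights hunk above, and FastLinear.load in the new medusa.py (below) resolves tensors by prefixed names. The key scheme is inferred from those prefixes, not stated in the commit; a quick way to sanity-check a converted file:

```python
# Hedged sanity check, not part of the commit: list the tensors in a converted
# Medusa head file. Given the prefixes in utils/medusa.py, expected keys look
# like "0.0.linear.weight" and "0.0.linear.bias" (head 0, ResBlock 0) and
# "0.1.weight" (head 0's final projection when medusa_num_layers == 1).
from safetensors import safe_open

with safe_open("medusa_lm_head.safetensors", framework="pt") as f:
    for name in sorted(f.keys()):
        print(name, f.get_slice(name).get_shape())
```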
new file: server/text_generation_server/utils/medusa.py (38 lines)
@@ -0,0 +1,38 @@
+import torch
+from text_generation_server.utils.layers import TensorParallelHead, FastLinear
+
+
+class ResBlock(torch.nn.Module):
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        self.linear = FastLinear.load(config, prefix=f"{prefix}.linear", weights=weights, bias=True)
+        self.act = torch.nn.SiLU()
+
+    def forward(self, x):
+        return x + self.act(self.linear(x))
+
+
+class MedusaModel(torch.nn.Module):
+    def __init__(
+        self,
+        config,
+        weights
+    ):
+        super().__init__()
+        self.heads = torch.nn.ModuleList(
+            [MedusaHead(config, prefix=f"{i}", weights=weights) for i in range(config["medusa_num_heads"])]
+        )
+
+
+class MedusaHead(torch.nn.Module):
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        self.blocks = torch.nn.ModuleList([ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) for i in range(config["medusa_num_layers"])])
+        n = len(self.blocks)
+        self.out = FastLinear.load(config, prefix=f"{prefix}.{n}", weights=weights, bias=False)
+
+    def forward(self, x):
+        for block in self.blocks:
+            x = block(x)
+        x = self.out(x)
+        return x
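As the commit message says, this is tmp work: MedusaModel defines no forward() even though flash_llama.py assigns it to model.lm_head, and TensorParallelHead is imported but never used, hinting the base head was meant to be folded in later. A plausible completion, purely a sketch and not part of the diff, would return the base head's logits alongside one set of speculative logits per Medusa head:

```python
# Assumed completion sketch (not in the commit): wire the base lm_head back in
# and emit speculative logits, one distribution per Medusa head.
import torch


class MedusaModelSketch(torch.nn.Module):
    def __init__(self, heads: torch.nn.ModuleList, lm_head: torch.nn.Module):
        super().__init__()
        self.heads = heads        # MedusaHead modules, as loaded above
        self.lm_head = lm_head    # the base model's original output head

    def forward(self, x: torch.Tensor):
        logits = self.lm_head(x)  # (batch, vocab_size): the "real" next token
        # (batch, num_heads, vocab_size): candidates for positions +2, +3, ...
        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
        return logits, speculative_logits
```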