From 7d96b1a10374ea1c1eb2114043753a9a251772ba Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 2 Jul 2024 14:45:35 +0200
Subject: [PATCH] More dead code.

---
 .../models/flash_cohere.py     |  75 --------
 .../models/flash_llama.py      | 171 ------------------
 .../models/flash_starcoder2.py |  83 ---------
 3 files changed, 329 deletions(-)
 delete mode 100644 server/text_generation_server/models/flash_cohere.py
 delete mode 100644 server/text_generation_server/models/flash_llama.py
 delete mode 100644 server/text_generation_server/models/flash_starcoder2.py

diff --git a/server/text_generation_server/models/flash_cohere.py b/server/text_generation_server/models/flash_cohere.py
deleted file mode 100644
index 9f8bcb3f..00000000
--- a/server/text_generation_server/models/flash_cohere.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import torch
-import torch.distributed
-
-from opentelemetry import trace
-from typing import Optional
-from transformers import AutoTokenizer, AutoConfig
-
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_cohere_modeling import (
-    FlashCohereForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-tracer = trace.get_tracer(__name__)
-
-
-class FlashCohere(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashCohere is only available on GPU")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-            use_fast=True,
-            from_slow=False,
-        )
-
-        config = AutoConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-
-        torch.distributed.barrier(group=self.process_group)
-
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-
-        model = FlashCohereForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashCohere, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
deleted file mode 100644
index d996b9c3..00000000
--- a/server/text_generation_server/models/flash_llama.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import os
-import torch
-import torch.distributed
-
-from opentelemetry import trace
-from transformers import AutoConfig, AutoTokenizer, GenerationConfig
-from typing import Optional, Tuple, Dict, List
-
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_llama_modeling import (
-    FlashLlamaForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-    hub,
-)
-
-tracer = trace.get_tracer(__name__)
-
-from text_generation_server.utils.import_utils import SYSTEM
-
-ADAPTER_LAYERS = [
-    "q_proj",
-    "k_proj",
-    "v_proj",
-    "o_proj",
-    "gate_proj",
-    "up_proj",
-    "down_proj",
-]
-ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"}
-
-
-class FlashLlama(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-        lora_adapter_ids: Optional[list] = [],
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "ipex":
-            if hasattr(torch, "xpu") and torch.xpu.is_available():
-                device = torch.device(f"xpu:{rank}")
-                dtype = torch.float16 if dtype is None else dtype
-            else:
-                device = torch.device("cpu")
-                dtype = torch.bfloat16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashLlama is only available on GPU")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        try:
-            generation_config = GenerationConfig.from_pretrained(
-                model_id, revision=revision, trust_remote_code=trust_remote_code
-            )
-            if isinstance(generation_config.eos_token_id, (list, set)):
-                # TODO Huge hack
-                tokenizer._eos_token_ids = set(generation_config.eos_token_id)
-        except Exception:
-            pass
-
-        config = AutoConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-
-        torch.distributed.barrier(group=self.process_group)
-
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-
-        prefix = ""
-        model = FlashLlamaForCausalLM(prefix, config, weights)
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashLlama, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    @property
-    def supports_adapter_loading(self) -> bool:
-        return True
-
-    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:
-        layer_weights = {}
-
-        prefix = "model.layers"
-
-        # This accounts for VLMs (e.g. LlavaNext, Idefics2)
-        # that have a language_model inside of the larger model.
-        if hasattr(self.model, "language_model"):
-            _model = self.model.language_model
-        elif hasattr(self.model, "text_model"):
-            _model = self.model.text_model
-        else:
-            _model = self.model
-
-        for i, layer in enumerate(_model.model.layers):
-            layer_weights[(i, "q_proj")] = (
-                f"{prefix}.{i}.self_attn.q_proj",
-                layer.self_attn.query_key_value,
-            )
-            layer_weights[(i, "k_proj")] = (
-                f"{prefix}.{i}.self_attn.k_proj",
-                layer.self_attn.query_key_value,
-            )
-            layer_weights[(i, "v_proj")] = (
-                f"{prefix}.{i}.self_attn.v_proj",
-                layer.self_attn.query_key_value,
-            )
-            layer_weights[(i, "o_proj")] = (
-                f"{prefix}.{i}.self_attn.o_proj",
-                layer.self_attn.o_proj,
-            )
-
-            layer_weights[(i, "gate_proj")] = (
-                f"{prefix}.{i}.mlp.gate_proj",
-                layer.mlp.gate_up_proj,
-            )
-            layer_weights[(i, "up_proj")] = (
-                f"{prefix}.{i}.mlp.up_proj",
-                layer.mlp.gate_up_proj,
-            )
-            layer_weights[(i, "down_proj")] = (
-                f"{prefix}.{i}.mlp.down_proj",
-                layer.mlp.down_proj,
-            )
-
-        layer_weights[(0, "lm_head")] = ("lm_head", _model.lm_head)
-        return layer_weights
-
-    @property
-    def adapter_layers(self) -> List[str]:
-        return ADAPTER_LAYERS
-
-    @property
-    def default_traced_adapter_layers(self) -> List[str]:
-        return ["q_proj", "v_proj"]
-
-    def get_num_layers_for_type(self, layer_type: str) -> int:
-        return 1 if layer_type == "lm_head" else len(self.model.model.layers)
-
-    def is_row_parallel(self, layer_type: str) -> bool:
-        return layer_type in ROW_PARALLEL
diff --git a/server/text_generation_server/models/flash_starcoder2.py b/server/text_generation_server/models/flash_starcoder2.py
deleted file mode 100644
index 16c9a8b9..00000000
--- a/server/text_generation_server/models/flash_starcoder2.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import math
-
-import torch
-
-from typing import Optional
-
-from transformers.models.gpt2 import GPT2TokenizerFast
-
-from text_generation_server.models.flash_mistral import (
-    FlashMistral,
-)
-from text_generation_server.models.custom_modeling.flash_starcoder2_modeling import (
-    Starcoder2Config,
-    FlashStarcoder2ForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-
-# Starcoder2 has the same base as Mistral
-class FlashStarcoder2(FlashMistral):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashStarcoder2 is only available on GPU")
-
-        tokenizer = GPT2TokenizerFast.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = Starcoder2Config.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-
-        # Set context windows
-        if config.sliding_window is not None:
-            set_sliding_window(config.sliding_window)
-
-        torch.distributed.barrier(group=self.process_group)
-
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-
-        model = FlashStarcoder2ForCausalLM(config, weights)
-
-        self.cuda_graphs = {}
-
-        torch.distributed.barrier(group=self.process_group)
-        super(BaseFlashMistral, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-            sliding_window=config.sliding_window,
-        )