mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

commit ade0f44aca (parent 9f5c9a5e22): add transformers_flash
server/text_generation_server/models/__init__.py
@@ -20,6 +20,7 @@ from pathlib import Path
 from text_generation_server.utils.speculate import get_speculate, set_speculate
 from text_generation_server.models.model import Model
 from text_generation_server.models.causal_lm import CausalLM, CausalLMBatchKeysLast
+from text_generation_server.models.transformers_flash_causal_lm import TransformersFlashCausalLM
 from text_generation_server.models.custom_modeling.opt_modeling import OPTForCausalLM
 from text_generation_server.models.custom_modeling.mpt_modeling import (
     MPTForCausalLM,
@@ -28,7 +29,7 @@ from text_generation_server.models.bloom import BloomCausalLMBatch
 from text_generation_server.models.custom_modeling.bloom_modeling import (
     BloomForCausalLM,
 )
-from text_generation_server.models.globals import ATTENTION
+from text_generation_server.models.globals import ATTENTION, USE_CUSTOM_MODELING
 from text_generation_server.models.seq2seq_lm import Seq2SeqLM
 from text_generation_server.models.galactica import GalacticaCausalLMBatch
 from text_generation_server.models.custom_modeling.neox_modeling import (
@@ -366,12 +367,38 @@ def get_model(
     max_input_tokens: int,
 ) -> Model:
     global FLASH_ATTENTION
+    global USE_CUSTOM_MODELING

     config_dict, _ = PretrainedConfig.get_config_dict(
         model_id, revision=revision, trust_remote_code=trust_remote_code
     )
     model_type = config_dict.get("model_type", None)

+    transformers_causal_lm_class = CausalLM
+    if (
+        not USE_CUSTOM_MODELING
+        and model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+    ):
+        logger.info(
+            "TGI's flash enabled models could either not be loaded or are disabled, using Transformers fallback."
+        )
+        transformers_model_class = getattr(
+            transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
+        )
+
+        if (
+            transformers_model_class._supports_flash_attn_2
+            and transformers_model_class._supports_cache_class
+        ):
+            logger.info(
+                f"Transformers' {model_type} implementation supports custom cache and flash/paged attention. Using TransformersFlashCausalLM with ragged tensors (single dimension for batch and sequence length)."
+            )
+            transformers_causal_lm_class = TransformersFlashCausalLM
+        else:
+            logger.info(
+                f"Transformers' {model_type} implementation does not support custom cache and flash/paged attention. Using TransformersCausalLM with classic tensors with padding (two dimensions for batch size and sequence length)."
+            )
+
     quantization_config = config_dict.get("quantization_config", None)
     if quantization_config is None:
         quantization_config = config_dict.get("compression_config", None)
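For reference, a minimal sketch of the capability check performed in the hunk above, extracted into a standalone helper (the helper name is hypothetical; `modeling_auto` is assumed to be `transformers.models.auto.modeling_auto`, and `_supports_flash_attn_2` / `_supports_cache_class` are the private Transformers class attributes referenced in the diff):

import transformers
from transformers.models.auto import modeling_auto


def supports_flash_path(model_type: str) -> bool:
    # Mirrors the selection in get_model above: the ragged/flash path is taken
    # only when the Transformers class advertises FlashAttention-2 support and
    # the new cache-class API; otherwise padded CausalLM tensors are used.
    if model_type not in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
        return False
    model_class = getattr(
        transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
    )
    return bool(
        getattr(model_class, "_supports_flash_attn_2", False)
        and getattr(model_class, "_supports_cache_class", False)
    )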
server/text_generation_server/models/globals.py
@@ -67,3 +67,7 @@ def set_adapter_to_index(adapter_to_index: Dict[str, int]):
 def get_adapter_to_index():
     global ADAPTER_TO_INDEX
     return ADAPTER_TO_INDEX
+
+
+USE_CUSTOM_MODELING = os.getenv("USE_CUSTOM_MODELING", "true")
+USE_CUSTOM_MODELING = USE_CUSTOM_MODELING == "true" or USE_CUSTOM_MODELING == "1"
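The flag is evaluated once when `text_generation_server.models.globals` is imported, so custom modeling is toggled through the environment. A minimal sketch, assuming the variable is set before the server package is imported:

import os

# Any value other than "true" or "1" disables TGI's custom modeling code, so
# get_model falls back to the Transformers implementations selected above.
os.environ["USE_CUSTOM_MODELING"] = "false"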
server/text_generation_server/models/transformers_flash_causal_lm.py (new file)
@@ -0,0 +1,309 @@
import math
import sys
from typing import Optional, Tuple, Dict, Any

import torch
from opentelemetry import trace
from loguru import logger
from transformers import AutoTokenizer, AutoModelForCausalLM

from text_generation_server.models.flash_causal_lm import (
    FlashCausalLMBatch,
    FlashCausalLM,
)
from text_generation_server.utils.import_utils import (
    empty_cache,
    synchronize,
    get_free_memory,
)
from text_generation_server.adapters import AdapterBatchData
from text_generation_server.layers.attention import paged_attention, attention, Seqlen
from text_generation_server.layers.attention.kv_cache import KVScales
from text_generation_server.models.globals import ATTENTION
from text_generation_server.models.metadata_kernels import block_tables_to_ragged


tracer = trace.get_tracer(__name__)


def patch_everywhere(
    attribute_name: str, patch: Any, module_name_prefix: Optional[str] = None
):
    """
    Finds all occurrences of `attribute_name` in the loaded modules and patches them with `patch`.

    Args:
        attribute_name (`str`):
            The name of the attribute to patch.
        patch (`Any`):
            The patch for the attribute.
        module_name_prefix (`Optional[str]`, defaults to `None`):
            If set, only module names starting with this prefix will be considered for patching.
    """
    # sys.modules may be updated while being iterated over, hence the list copy.
    for name in list(sys.modules):
        module = sys.modules[name]
        if module_name_prefix is not None and not name.startswith(module_name_prefix):
            continue
        if hasattr(module, attribute_name):
            setattr(module, attribute_name, patch)
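# The function below is swapped in for Transformers' `_flash_attention_forward`
# during TransformersFlashCausalLM.warmup (via patch_everywhere above), so every
# already-imported modeling module routes attention through TGI's paged KV cache
# and flash/paged attention kernels instead of the stock Transformers path.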
def _flash_attention_forward_patched(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    is_causal: bool,
    softmax_scale: Optional[float] = None,
    sliding_window: int = -1,
    softcap: Optional[float] = None,
    **kwargs,
):
    kv_cache = kwargs["kv_cache"][kwargs["layer_idx"]]
    # This means no scale
    kv_scales = KVScales(
        torch.tensor(1.0, device=key_states.device),
        torch.tensor(1.0, device=key_states.device),
    )

    # Correctly reshape the states: from [batch, seq, heads, head_dim]
    # to ragged [tokens, heads, head_dim]
    _, _, num_heads, head_dim = query_states.size()
    _, _, num_kv_heads, _ = key_states.size()
    query_states = query_states.view(-1, num_heads, head_dim)
    key_states = key_states.view(-1, num_kv_heads, head_dim)
    value_states = value_states.view(-1, num_kv_heads, head_dim)

    # Take care of updating the cache in-place
    kv_cache.store(
        key=key_states,
        value=value_states,
        slots=kwargs["slots"],
        kv_scales=kv_scales,
    )

    softmax_scale = 1 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale

    if kwargs["cu_seqlen_prefill"] is not None:
        # Prefill: variable-length attention over the concatenated sequences
        attn_output = attention(
            query=query_states,
            key=key_states,
            value=value_states,
            kv_cache=kv_cache,
            kv_scales=kv_scales,
            seqlen=kwargs["seqlen"],
            block_tables=kwargs["block_tables"],
            softmax_scale=softmax_scale,
            window_size_left=sliding_window,
            softcap=softcap,
        )
    else:
        # Decode: paged attention against the block-allocated KV cache
        attn_output = paged_attention(
            query_states,
            kv_cache,
            kwargs["kv_head_mapping"],
            softmax_scale,
            kwargs["block_tables"],
            kwargs["seqlen"],
            kwargs["max_s"],
            kv_scales=kv_scales,
            softcap=softcap,
        )

    # Merge the heads back into the hidden dimension
    attn_output = attn_output.view(attn_output.shape[0], -1)

    return attn_output
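# TransformersFlashCausalLM loads weights through AutoModelForCausalLM but otherwise
# relies on FlashCausalLM for batching, KV-cache allocation, warmup and CUDA-graph
# handling; only __init__, warmup and forward are overridden below.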
class TransformersFlashCausalLM(FlashCausalLM):
    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")

        device_count = 0
        if torch.cuda.is_available():
            device = torch.device("cuda")
            device_count = torch.cuda.device_count()
            dtype = torch.float16 if dtype is None else dtype
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device("xpu")
            device_count = torch.xpu.device_count()
            dtype = torch.float16 if dtype is None else dtype
        else:
            if quantize:
                raise ValueError("quantization is not available on CPU")

            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=dtype,
            device_map=("auto" if device_count > 1 else None),
            load_in_8bit=quantize == "bitsandbytes",
            trust_remote_code=trust_remote_code,
        )
        if device_count == 1 and quantize != "bitsandbytes":
            model = model.to(device)

        if tokenizer.pad_token_id is None:
            if model.config.pad_token_id is not None:
                tokenizer.pad_token_id = model.config.pad_token_id
            elif model.config.eos_token_id is not None and isinstance(
                model.config.eos_token_id, int
            ):
                tokenizer.pad_token_id = model.config.eos_token_id
            elif tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        self.num_layers = len(model.model.layers)
        self.num_kv_heads = model.config.num_key_value_heads
        self.head_size = model.config.hidden_size // model.config.num_attention_heads

        # Skip FlashCausalLM.__init__: weights are already loaded via AutoModelForCausalLM.
        super(FlashCausalLM, self).__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=False,
            dtype=dtype,
            device=device,
        )
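    # A note on ordering (assumption based on the call sites above): warmup installs
    # the patched attention forward before FlashCausalLM's own warmup runs, and by
    # that point the model's Transformers modeling modules are already in
    # sys.modules, so patch_everywhere can reach them.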
    def warmup(self, batch: FlashCausalLMBatch):
        patch_everywhere("_flash_attention_forward", _flash_attention_forward_patched)
        super().warmup(batch)

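    # forward mirrors FlashCausalLM.forward: the TGI-specific keyword arguments
    # (cu_seqlen_prefill, kv_cache, block_tables, slots, seqlen, max_s, ...) are
    # passed to self.model.forward and are expected to flow through Transformers'
    # kwargs plumbing into the patched _flash_attention_forward above.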
    def forward(
        self, batch: FlashCausalLMBatch, adapter_data: AdapterBatchData
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # NOTE: adapter_data: not supported

        input_ids = batch.input_ids
        position_ids = batch.position_ids
        cu_seqlen_prefill = batch.cu_seqlen_prefill
        kv_cache = self.kv_cache
        block_tables = batch.block_tables_tensor
        slots = batch.slots[batch.slot_indices]
        input_lengths = batch.input_lengths_tensor
        cache_lengths_tensor = batch.cache_lengths_tensor
        max_s = batch.max_current_length
        lm_head_indices = batch.prefill_head_indices

        if cu_seqlen_prefill is None and self.max_past() is not None:
            # In decode, not prefill, we're actually overwriting the KV-cache
            # in a circular buffer mode.
            # This makes sure the max_s for the decode pass is correct.
            max_s = min(self.max_past(), max_s)

        bs = input_ids.shape[0]
        sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
        if sorted_padded_bs:
            # Get associated cuda graph
            cuda_graph = self.cuda_graphs[sorted_padded_bs[0]]
        else:
            cuda_graph = None

        if cu_seqlen_prefill is not None or cuda_graph is None:
            if ATTENTION == "flashinfer":
                block_tables = block_tables_to_ragged(
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
                cu_seqlen_prefill=cu_seqlen_prefill,
                input_lengths_tensor=input_lengths,
                cache_lengths_tensor=cache_lengths_tensor,
            ):
                seqlen = Seqlen(
                    input_lengths=input_lengths,
                    cache_lengths=cache_lengths_tensor,
                    cu_seqlen_q=cu_seqlen_prefill,
                    max_q=batch.max_input_length,
                    max_k=batch.max_current_length,
                )
                logits = self.model.forward(
                    input_ids=input_ids,
                    position_ids=position_ids,
                    past_key_values=None,
                    use_cache=False,  # we use self.kv_cache instead of transformers cache object
                    cu_seqlen_prefill=cu_seqlen_prefill,
                    kv_cache=kv_cache,
                    block_tables=block_tables,
                    slots=slots,
                    seqlen=seqlen,
                    max_s=max_s,
                    prefill_cache_indices=batch.prefill_cache_indices,
                    lm_head_indices=lm_head_indices,
                )
                if batch.prefill_cache_indices is not None:
                    batch.prefill_cache_indices = None
                return logits, None

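        # Decode-only batch that fits a captured CUDA graph: copy the batch into the
        # graph's padded static buffers and replay it instead of running eagerly.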
        # Copy inputs to the static inputs of the cuda graph
        # Static inputs are potentially padded
        cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids
        cuda_graph["position_ids"][: position_ids.shape[-1]] = position_ids
        if ATTENTION == "flashinfer":
            block_tables = block_tables_to_ragged(
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            # assert block_tables.shape[0] >= slots.shape[0]
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:
            cuda_graph["block_tables"][
                : block_tables.shape[0], : block_tables.shape[1]
            ] = block_tables

        # XXX: This is working only because block 0 is reserved for the healthcheck
        # so it doesn't matter if we override it with bogus values.
        cuda_graph["slots"].fill_(0)
        cuda_graph["slots"][: slots.shape[0]] = slots
        cuda_graph["input_lengths"].zero_()
        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
        cuda_graph["cache_lengths"].zero_()
        cuda_graph["cache_lengths"][
            : cache_lengths_tensor.shape[0]
        ] = cache_lengths_tensor

        with self._forward_context(
            block_tables=cuda_graph["block_tables"],
            cu_seqlen_prefill=None,
            input_lengths_tensor=cuda_graph["input_lengths"],
            cache_lengths_tensor=cuda_graph["cache_lengths"],
            state=cuda_graph["state"],
        ):
            # Replay the graph
            cuda_graph["graph"].replay()

        # Slice output to the correct shape
        logits = cuda_graph["logits"][:bs]
        return logits, None
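End to end, this class is normally constructed by get_model when the Transformers fallback is selected; a minimal sketch of direct construction (the model id is illustrative, and a CUDA or XPU device is assumed):

from text_generation_server.models.transformers_flash_causal_lm import (
    TransformersFlashCausalLM,
)

# Hypothetical direct use; inside TGI this is done by get_model in
# text_generation_server/models/__init__.py when USE_CUSTOM_MODELING is disabled
# and the Transformers class passes the flash/cache capability check.
model = TransformersFlashCausalLM(
    model_id="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
    revision=None,
    quantize=None,
    speculator=None,
    dtype=None,  # defaults to float16 on CUDA/XPU, float32 on CPU
    trust_remote_code=False,
)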