mirror of https://github.com/huggingface/text-generation-inference.git

feat: prefer custom model and produce correct output

parent 35939a28c7
commit 1c32d53fc3
@@ -17,6 +17,7 @@ class MambaConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size=50280,
+        d_model=768,
         n_layer=32,
         layer_norm_epsilon=1e-5,
         tie_word_embeddings=False,
@@ -28,6 +29,9 @@ class MambaConfig(PretrainedConfig):
         self.vocab_size = vocab_size
         self.n_layer = n_layer
         self.layer_norm_epsilon = layer_norm_epsilon
+        self.d_model = d_model
+        self.d_inner = d_model * 2
+        self.d_conv = 4

         super().__init__(
             pad_token_id=pad_token_id,
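For orientation, the derived sizes are simple arithmetic on d_model; a quick illustrative check (values assume the 768-wide default above, not taken from the repo):

d_model = 768            # MambaConfig default above
d_inner = d_model * 2    # config.d_inner -> 1536
d_conv = 4               # config.d_conv, fixed kernel size
print(d_inner, d_conv)   # 1536 4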
@@ -43,19 +47,17 @@ class MambaBlock(nn.Module):
         super().__init__()

         # TODO: use model config to set the dt_rank instead of hardcoding it
-        d_inner = 768 * 2
-        d_conv = 4
-        self.dt_rank = (768 + 15) // 16
+        self.dt_rank = (config.d_model + 15) // 16

         # TODO: improve how we load the conv1d weights
         # explore a transposed conv1d that avoids the need for
         # a transpose during inference
         self.conv1 = nn.Conv1d(
-            d_inner,
-            d_inner,
-            kernel_size=d_conv,
-            groups=d_inner,
-            padding=d_conv - 1,
+            config.d_inner,
+            config.d_inner,
+            kernel_size=config.d_conv,
+            groups=config.d_inner,
+            padding=config.d_conv - 1,
         )
         self.conv1.weight = nn.Parameter(weights.get_tensor(f"{prefix}.conv1d.weight"))
         self.conv1.bias = nn.Parameter(weights.get_tensor(f"{prefix}.conv1d.bias"))
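As a sanity check on the sizes now read from the config, here is a small standalone sketch of the depthwise convolution this constructs (illustrative only; dt_rank and the shapes below assume d_model=768):

import torch
import torch.nn as nn

d_model = 768
d_inner = d_model * 2                 # config.d_inner
d_conv = 4                            # config.d_conv
dt_rank = (d_model + 15) // 16        # ceil(d_model / 16) == 48

# depthwise conv over the sequence dimension: one filter per channel
conv1 = nn.Conv1d(d_inner, d_inner, kernel_size=d_conv, groups=d_inner, padding=d_conv - 1)
print(conv1.weight.shape)             # torch.Size([1536, 1, 4])

x = torch.randn(1, d_inner, 10)       # (batch, channels, sequence_length)
print(conv1(x).shape)                 # torch.Size([1, 1536, 13]) before narrowing back to 10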
@@ -148,12 +150,33 @@ class MambaBlock(nn.Module):
         )
         return selective_scan_output

-    def forward(self, hidden_states):
+    def forward(self, index, hidden_states, past_transformed_state):
         sequence_length = hidden_states.shape[1]

-        projected_states = self.in_proj(hidden_states)
-        split_states = torch.chunk(projected_states, 2, dim=-1)
-        transformed_states, residual_states = split_states
+        # minimal amount of new work on single hidden state (previous hidden state are cached)
+        only_last = hidden_states[:, -1, :]
+        projected_only_last = self.in_proj(only_last)
+        transformed_only_last, residual_only_last = torch.chunk(
+            projected_only_last, 2, dim=-1
+        )
+
+        if past_transformed_state is not None:
+            # build a new transformed_states tensor with past_transformed_state and transformed_only_last
+            new_transformed_states = torch.cat(
+                [past_transformed_state, transformed_only_last.unsqueeze(1)], dim=1
+            )
+            transformed_states = new_transformed_states
+            residual_states = residual_only_last
+        else:
+            # prefilling the cache with the last transformed state
+            projected_states = self.in_proj(hidden_states)
+            split_states = torch.chunk(projected_states, 2, dim=-1)
+            transformed_states, residual_states = split_states
+
+        # NOTE: we need the past hidden states to produce the correct output
+        # therefore we cannot simply compute the most recent and append it as we
+        # did for the transformed states
+
         # TODO: avoid the transpose by using a transposed conv1d
         # apply convolution and narrowing operation
         conv_output = (
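Conceptually, this caches the in_proj output ("transformed states") so that during decoding only the newest token is projected and appended to the cache; a rough standalone sketch of the idea (hypothetical sizes, simplified relative to the block above):

import torch
import torch.nn as nn

d_model, d_inner = 768, 1536
in_proj = nn.Linear(d_model, d_inner * 2, bias=False)

def project_incremental(hidden_states, past_transformed_state):
    # hidden_states: (batch, seq_len, d_model); cache holds (batch, past_len, d_inner)
    if past_transformed_state is None:
        # prefill: project the whole prompt once and keep it as the cache
        transformed, residual = torch.chunk(in_proj(hidden_states), 2, dim=-1)
    else:
        # decode: project only the newest token and append it to the cache
        last = hidden_states[:, -1, :]
        t_last, residual = torch.chunk(in_proj(last), 2, dim=-1)
        transformed = torch.cat([past_transformed_state, t_last.unsqueeze(1)], dim=1)
    return transformed, residual

x = torch.randn(1, 5, d_model)
cache, _ = project_incremental(x, None)     # prefill: cache is (1, 5, 1536)
x = torch.randn(1, 6, d_model)              # one new token appended upstream
cache, _ = project_incremental(x, cache)    # decode: cache grows to (1, 6, 1536)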
@@ -170,7 +193,8 @@ class MambaBlock(nn.Module):
         output = self.ssm(activated_transformed)
         combined_output = output * activated_residual

-        return self.out_proj(combined_output)
+        return self.out_proj(combined_output), transformed_states
+

 # TODO: prefer a more optimized implementation of RMSNorm if possible
 class RMSNorm(nn.Module):
@@ -193,20 +217,24 @@ class ResidualBlock(nn.Module):
         self.mamba_block = MambaBlock(
             prefix=f"{layer_id}.mixer", config=config, weights=weights
         )
-        self.layer_norm = RMSNorm(768, eps=config.layer_norm_epsilon)
+        self.layer_norm = RMSNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.layer_norm.scale = nn.Parameter(
             weights.get_tensor(f"{layer_id}.norm.weight")
         )

     def forward(
         self,
+        index,
         hidden_states,
+        past_transformed_state,
     ):
         residual = hidden_states
         hidden_states = self.layer_norm(hidden_states)
-        attn_outputs = self.mamba_block(hidden_states)
+        attn_outputs, transformed_states = self.mamba_block(
+            index, hidden_states, past_transformed_state
+        )
         hidden_states = residual + attn_outputs
-        return hidden_states
+        return hidden_states, transformed_states


 class MambaModel(nn.Module):
@@ -225,10 +253,10 @@ class MambaModel(nn.Module):
         )

         # TODO: avoid hardcoded sizes and improve how we load the weights
-        self.norm_f = RMSNorm(768, eps=config.layer_norm_epsilon)
+        self.norm_f = RMSNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.norm_f.scale = nn.Parameter(weights.get_tensor(f"backbone.norm_f.weight"))
         # use the same weights for the embedding and the final layer norm
-        self.lm_head = nn.Linear(768, config.vocab_size, bias=False)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
         self.lm_head.weight = nn.Parameter(
             self.embed_tokens.weight[: config.vocab_size, :]
         )
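For context, a minimal sketch of the weight tying done here, with the head now sized from config.d_model rather than a hardcoded 768 (the padded embedding size below is made up for illustration):

import torch
import torch.nn as nn

vocab_size, d_model = 50280, 768
embed_tokens = nn.Embedding(50288, d_model)           # hypothetical padded embedding table
lm_head = nn.Linear(d_model, vocab_size, bias=False)

# reuse the first vocab_size rows of the embedding matrix as the output projection
lm_head.weight = nn.Parameter(embed_tokens.weight[:vocab_size, :])
print(lm_head.weight.shape)                           # torch.Size([50280, 768])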
@@ -237,50 +265,26 @@ class MambaModel(nn.Module):
         self,
         input_ids: torch.LongTensor,
         past_input_ids: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        past_transformed_states: Optional[List[Tuple[torch.FloatTensor]]] = None,
     ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
-        # TODO: dont use past_input_ids for the input_ids
-        # find a way to cache previous states/work
+        # NOTE: we need all input_ids to compute the correct embeddings
         if past_input_ids is not None:
-            # append the contents to the input_ids
-            input_ids = torch.cat((past_input_ids, input_ids), dim=1)
+            input_ids = past_input_ids

         hidden_states = self.embed_tokens(input_ids)

-        for _, block in enumerate(self.blocks):
-            hidden_states = block(hidden_states)
+        past_transformed_states = (
+            [None] * len(self.blocks)
+            if past_transformed_states is None
+            else past_transformed_states
+        )
+
+        for index, block in enumerate(self.blocks):
+            hidden_states, transformed_states = block(
+                index, hidden_states, past_transformed_states[index]
+            )
+            past_transformed_states[index] = transformed_states
+
         final_hidden_states = self.norm_f(hidden_states)
         after_lm_head = self.lm_head(final_hidden_states)
-        return after_lm_head, input_ids
+        return after_lm_head, input_ids, past_transformed_states
-
-
-# TODO: revisit if we want to use CausalLM
-class MambaForCausalLM(torch.nn.Module):
-    def __init__(self, config, weights):
-        super().__init__()
-        self.model = MambaModel(config, weights)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        # TODO: dont abuse past_key_values for the input_ids
-        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
-        # below are unused since this model is attention free
-        attention_mask: Optional[torch.ByteTensor] = None,
-        return_dict: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        labels: Optional[torch.LongTensor] = None,
-    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
-        model_output = self.model(
-            input_ids,
-            past_input_ids=past_key_values,
-        )
-        logits = model_output[0]
-        past_hidden_states = model_output[1]
-        return CausalLMOutputWithPast(
-            loss=None,
-            logits=logits,
-            past_key_values=past_hidden_states,
-            hidden_states=None,
-            attentions=None,
-        )
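In effect, the model now threads one cache entry per ResidualBlock through the forward pass; a rough sketch of that pattern in isolation (toy blocks, not the repo's classes):

import torch
import torch.nn as nn

class ToyBlock(nn.Module):
    # stand-in for ResidualBlock: returns new hidden states plus its cache entry
    def forward(self, index, hidden_states, past_state):
        last = hidden_states[:, -1:, :]
        new_state = last if past_state is None else torch.cat([past_state, last], dim=1)
        return hidden_states, new_state

blocks = nn.ModuleList([ToyBlock() for _ in range(3)])
hidden_states = torch.randn(1, 4, 8)

past_states = [None] * len(blocks)    # one slot per layer, filled on the first call
for index, block in enumerate(blocks):
    hidden_states, state = block(index, hidden_states, past_states[index])
    past_states[index] = state        # updated cache is handed back to the caller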
@@ -8,7 +8,6 @@ from text_generation_server.models import CausalLM
 from text_generation_server.models.causal_lm import CausalLMBatch
 from text_generation_server.models.custom_modeling.mamba_modeling import (
     MambaConfig,
-    MambaForCausalLM,
 )
 from text_generation_server.pb import generate_pb2
 from text_generation_server.utils import (
@@ -17,8 +16,27 @@ from text_generation_server.utils import (
     Weights,
 )

+import time
+from text_generation_server.models.custom_modeling.mamba_modeling import MambaModel
+from text_generation_server.models import Model
+from typing import Any, List, Optional, Tuple, Type
+from text_generation_server.models.types import (
+    Batch,
+    Tokens,
+    Generation,
+    GeneratedText,
+)
+from text_generation_server.utils.tokens import batch_top_tokens, Sampling
+

 class MambaCausalLMBatch(CausalLMBatch):
+    past_transformed_states: Optional[List[torch.Tensor]]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.past_input_ids = None
+        self.past_transformed_states = None
+
     @classmethod
     def from_pb(
         cls,
@@ -32,7 +50,7 @@ class MambaCausalLMBatch(CausalLMBatch):
         return batch


-class Mamba(CausalLM):
+class Mamba(Model):
     def __init__(
         self,
         model_id: str,
@@ -71,12 +89,195 @@ class Mamba(CausalLM):
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        model = MambaForCausalLM(config, weights)
+        model = MambaModel(config, weights)
         torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
+        super(Mamba, self).__init__(
             model=model,
             tokenizer=tokenizer,
             requires_padding=True,
             dtype=dtype,
             device=device,
         )
+
+    @property
+    def batch_type(self) -> Type[CausalLMBatch]:
+        return MambaCausalLMBatch
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        past: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self.model(
+            input_ids,
+            past=past,
+        )
+
+    def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tuple[int, int]]:
+        start = time.time_ns()
+
+        input_ids = batch.input_ids
+        past_input_ids = batch.past_input_ids
+        past_transformed_states = batch.past_transformed_states
+
+        model_output = self.model(
+            input_ids,
+            past_input_ids,
+            past_transformed_states,
+        )
+
+        logits = model_output[0]
+        past_input_ids = model_output[1]
+        past_transformed_states = model_output[2]
+
+        # Results
+        generations: List[Generation] = []
+        stopped = True
+
+        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
+            batch.top_n_tokens,
+            batch.top_n_tokens_tensor,
+            torch.log_softmax(logits[:, -1], -1),
+        )
+
+        start_decode = time.time_ns()
+
+        # Zipped iterator
+        iterator = zip(
+            batch.requests,
+            batch.input_lengths,
+            batch.prefix_offsets,
+            batch.read_offsets,
+            logits,
+            batch.next_token_choosers,
+            batch.stopping_criterias,
+            batch.all_input_ids,
+            batch.top_n_tokens,
+            batch_top_token_ids,
+            batch_top_token_logprobs,
+        )
+
+        # For each member of the batch
+        for i, (
+            request,
+            input_length,
+            prefix_offset,
+            read_offset,
+            logits,
+            next_token_chooser,
+            stopping_criteria,
+            all_input_ids,
+            top_n_tokens,
+            top_token_ids,
+            top_token_logprobs,
+        ) in enumerate(iterator):
+            # Select next token
+            next_token_id, logprobs = next_token_chooser(
+                all_input_ids.view(1, -1), logits[-1:, :]
+            )
+
+            # add next token to past_input_ids
+            past_input_ids = torch.cat([past_input_ids, next_token_id], dim=1)
+
+            # Append next token to all tokens
+            all_input_ids = torch.cat([all_input_ids, next_token_id])
+            new_input_length = input_length + 1
+
+            # Generated token
+            next_token_logprob = logprobs[-1, next_token_id]
+            next_token_id_squeezed = next_token_id.squeeze()
+            next_token_text, prefix_offset, read_offset = self.decode_token(
+                all_input_ids[:, 0], prefix_offset, read_offset
+            )
+
+            # Evaluate stopping criteria
+            stop, reason = stopping_criteria(
+                next_token_id_squeezed,
+                next_token_text,
+            )
+
+            if not stop:
+                stopped = False
+
+            if stop:
+                # Decode generated tokens
+                output_text, _, _ = self.decode_token(
+                    all_input_ids[:, 0],
+                    prefix_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens
+                    - 1,
+                    read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
+                    skip_special_tokens=True,
+                )
+                # Get seed
+                if isinstance(next_token_chooser.choice, Sampling):
+                    seed = next_token_chooser.choice.seed
+                else:
+                    seed = None
+
+                generated_text = GeneratedText(
+                    output_text, stopping_criteria.current_tokens, reason, seed
+                )
+            else:
+                generated_text = None
+
+            # Prefill
+            if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
+                # Remove generated token to only have prefill and add nan for first prompt token
+                prefill_logprobs = [float("nan")] + torch.log_softmax(
+                    logits, -1
+                ).gather(1, all_input_ids[1:]).squeeze(1)[-new_input_length:-1].tolist()
+                prefill_token_ids = all_input_ids[-new_input_length:-1]
+                prefill_texts = self.tokenizer.batch_decode(
+                    prefill_token_ids,
+                    clean_up_tokenization_spaces=False,
+                    skip_special_tokens=False,
+                )
+                prefill_tokens = Tokens(
+                    prefill_token_ids,
+                    prefill_logprobs,
+                    prefill_texts,
+                    is_special=[],
+                )
+            else:
+                prefill_tokens = None
+
+            generation = Generation(
+                batch.batch_id,
+                None,
+                Tokens(
+                    [next_token_id_squeezed],
+                    [next_token_logprob],
+                    [next_token_text],
+                    [next_token_id_squeezed.item() in self.all_special_ids],
+                ),
+                generated_text,
+                None,
+            )
+
+            generations.append(generation)
+
+            # Update values
+            batch.input_ids = torch.cat(
+                [batch.input_ids, torch.tensor([[next_token_id_squeezed]])], dim=1
+            )
+            batch.all_input_ids[i] = all_input_ids
+            batch.input_lengths[i] = new_input_length
+            batch.prefix_offsets[i] = prefix_offset
+            batch.read_offsets[i] = read_offset
+            batch.max_input_length = max(batch.max_input_length, new_input_length)
+
+        # We finished all generations in the batch; there is no next batch
+        if stopped:
+            forward_ns = start_decode - start
+            decode_ns = time.time_ns() - start_decode
+            return generations, None, (forward_ns, decode_ns)
+
+        # Slice unused values from prefill
+        batch.input_ids = batch.input_ids[:, :1]
+        batch.past_input_ids = past_input_ids
+        batch.past_transformed_states = past_transformed_states
+
+        forward_ns = start_decode - start
+        decode_ns = time.time_ns() - start_decode
+        return generations, batch, (forward_ns, decode_ns)
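At a high level, generate_token now keeps the growing token sequence and the per-layer transformed-state cache on the batch between calls; a simplified sketch of that control flow (toy model and batch objects, not the server's real types):

import torch

class ToyBatch:
    # stand-ins for the fields MambaCausalLMBatch keeps between generate_token calls
    def __init__(self, prompt_ids):
        self.input_ids = prompt_ids
        self.past_input_ids = None
        self.past_transformed_states = None

def toy_model(input_ids, past_input_ids, past_transformed_states):
    # mirrors the contract of MambaModel.forward: (logits, ids it ran on, updated caches)
    ids = input_ids if past_input_ids is None else past_input_ids
    logits = torch.randn(ids.shape[0], ids.shape[1], 16)
    caches = past_transformed_states if past_transformed_states is not None else [torch.zeros(1)]
    return logits, ids, caches

batch = ToyBatch(torch.tensor([[1, 2, 3]]))
for _ in range(3):  # three decode steps
    logits, ids, caches = toy_model(
        batch.input_ids, batch.past_input_ids, batch.past_transformed_states
    )
    next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
    # append the sampled token to the cached ids; they become the model input next step
    batch.past_input_ids = torch.cat([ids, next_token], dim=1)
    batch.past_transformed_states = caches
print(batch.past_input_ids.shape)  # torch.Size([1, 6]) after three appended tokens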