Fixes and format

2025-09-10 11:54:52 +00:00 · 2023-05-25 15:08:52 -04:00 · 2023-05-25 15:08:52 -04:00 · a515fbde4c
commit a515fbde4c
parent 0921fe6a2a
6 changed files with 683 additions and 283 deletions
--- a/server/text_generation_server/models/custom_modeling/gpt_bigcode2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/gpt_bigcode2_modeling.py
@ -13,14 +13,13 @@
 # limitations under the License.
 """PyTorch GPTBigCode model."""
 import math
-from typing import Optional, Tuple, Any, Union, List
-from enum import IntEnum
+from typing import Optional, Tuple, Any, List

 import torch
 import torch.utils.checkpoint
 from torch import nn

-from dropout_layer_norm import dropout_layer_norm
+from dropout_layer_norm import dropout_add_ln_fwd
 from flash_attn.bert_padding import pad_input, unpad_input
 from flash_attn.flash_attn_interface import flash_attn_unpadded_func

@ -30,10 +29,11 @@ from transformers.models.gpt_bigcode.configuration_gpt_bigcode import (
    GPTBigCodeConfig,
 )

+
 class FastLayerNorm(nn.LayerNorm):
    # TODO: Validate dimension
    def forward(self, hidden_states, residual=None):
-        return dropout_layer_norm.dropout_add_ln_fwd(
+        out, residual, *_ = dropout_add_ln_fwd(
            hidden_states,
            residual,
            self.weight,
@ -50,18 +50,24 @@ class FastLayerNorm(nn.LayerNorm):
            False,
            False,
        )
+        if residual is None:
+            residual = hidden_states
+        return out, residual


 class FastLinear(nn.Linear):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.addmm(self.bias, input, self.weight)

+
 class FastLinearNoBias(nn.Linear):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.mm(input, self.weight)

+
 logger = logging.get_logger(__name__)

+
@torch.jit.script
 def upcast_masked_softmax(
    x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float
@ -72,38 +78,53 @@ def upcast_masked_softmax(
    x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
    return x

+
@torch.jit.script
 def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
    x = torch.where(mask, x, mask_value)
    x = torch.nn.functional.softmax(x, dim=-1)
    return x

+
 class GPTBigCodeAttention(nn.Module):
-    def __init__(self, config:GPTBigCodeConfig, layer_idx:int, dtype:torch.dtype):
+    mask_value: torch.Tensor
+
+    def __init__(self, config: GPTBigCodeConfig, layer_idx: int, dtype: torch.dtype):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.layer_idx = layer_idx
-        # Note: Does not support module dtype conversion.
-        self.register_buffer("mask_value", torch.empty((), dtype=torch.float32, device="meta"))

-        self.c_attn = FastLinear(self.embed_dim, self.embed_dim + 2 * self.head_dim, dtype=dtype, device="meta")
-        self.c_proj = FastLinear(self.embed_dim, self.embed_dim, dtype=dtype, device="meta")
+        self.c_attn = FastLinear(
+            self.embed_dim,
+            self.embed_dim + 2 * self.head_dim,
+            dtype=dtype,
+            device="meta",
+        )
+        self.c_proj = FastLinear(
+            self.embed_dim, self.embed_dim, dtype=dtype, device="meta"
+        )

    def prefill(
        self,
        hidden_states: torch.Tensor,
        sequence_lengths,
-        key_length:int,
+        key_length: int,
    ) -> Tuple[torch.Tensor, Any]:
        hidden_shape = hidden_states.shape
-        query, key_value = self.c_attn.forward(hidden_states).split((self.embed_dim, 2 * self.head_dim), dim=-1)
+        query, key_value = self.c_attn.forward(hidden_states).split(
+            (self.embed_dim, 2 * self.head_dim), dim=-1
+        )
        query = query.view(hidden_shape[0], self.num_heads, self.head_dim)
-        key, value = key_value.unsqueeze(1).expand(hidden_shape[0], self.num_heads, 2*self.head_dim).split((self.head_dim, self.head_dim), dim=-1)
+        key, value = (
+            key_value.unsqueeze(1)
+            .expand(hidden_shape[0], self.num_heads, 2 * self.head_dim)
+            .split((self.head_dim, self.head_dim), dim=-1)
+        )

        # attn_output: (sum_seq_len, num_heads * head_dim)
-        attn_output = flash_attn_unpadded_func(
+        hidden_states = flash_attn_unpadded_func(
            query,
            key,
            value,
@ -116,23 +137,25 @@ class GPTBigCodeAttention(nn.Module):
            causal=True,
        ).view(hidden_shape)

-        attn_output = self.c_proj.forward(attn_output)
+        hidden_states = self.c_proj.forward(hidden_states)

-        return attn_output, key_value
+        return hidden_states, key_value

    def decode(
        self,
        hidden_states: torch.Tensor,
        layer_past: torch.Tensor,
        attention_mask: torch.Tensor,
-        batch_size:int,
-        key_length:int,
+        batch_size: int,
+        key_length: int,
    ) -> Tuple[torch.Tensor, Any]:
-        query, key_value = self.c_attn.forward(hidden_states).split((self.embed_dim, 2 * self.head_dim), dim=-1)
+        query, key_value = self.c_attn.forward(hidden_states).split(
+            (self.embed_dim, 2 * self.head_dim), dim=-1
+        )

        # Calculate dimensions and recover layer_past
        padded_key_length = attention_mask.size(-1)
-        allocated_key_length=layer_past.size(-2)
+        allocated_key_length = layer_past.size(-2)

        # TODO: Allow pre-allocation with size > padded_key_length
        if padded_key_length > allocated_key_length:
@ -142,37 +165,45 @@ class GPTBigCodeAttention(nn.Module):
                dtype=key_value.dtype,
                device=key_value.device,
            )
-            allocated_kv_cache[:, :key_length-1].copy_(layer_past[:, :key_length-1])
+            allocated_kv_cache[:, : key_length - 1].copy_(
+                layer_past[:, : key_length - 1]
+            )
            # Nans in `value` can propagate through the matrix multiplication,
            # so we set the remaining values to zero. (`last_key_length:key_length` is set below.)
            allocated_kv_cache[:, allocated_key_length:, self.head_dim :].zero_()
-            layer_past=allocated_kv_cache
+            layer_past = allocated_kv_cache

        # Copy the new values.
-        layer_past[:, key_length-1:key_length].copy_(key_value)
+        layer_past[:, key_length - 1].copy_(key_value)

        key, value = layer_past.split((self.head_dim, self.head_dim), dim=-1)

        # TODO: Upcasting needed for bf16?
        upcast = query.dtype != torch.float32
        unscale = self.layer_idx + 1 if upcast else 1
-        scale_factor = unscale ** -1 / self.head_dim ** 0.5
+        scale_factor = unscale**-1 / self.head_dim**0.5

+        # TODO: No need to unsqueeze?
        hidden_states = torch.baddbmm(
-            torch.empty((batch_size, self.num_heads, padded_key_length), device=query.device, dtype=query.dtype),
+            torch.empty(
+                (batch_size, self.num_heads, padded_key_length),
+                device=query.device,
+                dtype=query.dtype,
+            ),
            query.view(batch_size, self.num_heads, self.head_dim),
            key.transpose(-1, -2),
            beta=0,
-            alpha=scale_factor
+            alpha=scale_factor,
        ).unsqueeze_(1)

-        if self.mask_value is None or self.mask_value.device != hidden_states.device:
-            self.mask_value = torch.full([], torch.finfo(torch.float32).min, dtype=torch.float32, device=hidden_states.device)
-
        if upcast:
-            hidden_states = upcast_masked_softmax(hidden_states, attention_mask, self.mask_value, unscale)
+            hidden_states = upcast_masked_softmax(
+                hidden_states, attention_mask, self.mask_value, unscale
+            )
        else:
-            hidden_states = masked_softmax(hidden_states, attention_mask, self.mask_value)
+            hidden_states = masked_softmax(
+                hidden_states, attention_mask, self.mask_value
+            )

        hidden_states = torch.bmm(hidden_states.squeeze_(1), value).view(query.shape)

@ -180,21 +211,33 @@ class GPTBigCodeAttention(nn.Module):

        return hidden_states, layer_past

+
 class GPTBigCodeMLP(nn.Module):
    # TODO: Merge into GPTBigCodeBlock (needs renaming in state dict)
-    def __init__(self, config:GPTBigCodeConfig, dtype:torch.dtype):
+    def __init__(self, config: GPTBigCodeConfig, dtype: torch.dtype):
        super().__init__()
        embed_dim = config.hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * embed_dim
        self.c_fc = FastLinear(embed_dim, inner_dim, dtype=dtype, device="meta")
        self.c_proj = FastLinear(inner_dim, embed_dim, dtype=dtype, device="meta")

+
 class GPTBigCodeBlock(nn.Module):
-    def __init__(self, config:GPTBigCodeConfig, layer_idx:int, dtype:torch.dtype):
+    def __init__(self, config: GPTBigCodeConfig, layer_idx: int, dtype: torch.dtype):
        super().__init__()
-        self.ln_1 = FastLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, dtype=dtype, device="meta")
+        self.ln_1 = FastLayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_epsilon,
+            dtype=dtype,
+            device="meta",
+        )
        self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx, dtype=dtype)
-        self.ln_2 = FastLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, dtype=dtype, device="meta")
+        self.ln_2 = FastLayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_epsilon,
+            dtype=dtype,
+            device="meta",
+        )
        self.mlp = GPTBigCodeMLP(config, dtype=dtype)

    def prefill(
@ -202,7 +245,7 @@ class GPTBigCodeBlock(nn.Module):
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        sequence_lengths,
-        key_length:int,
+        key_length: int,
    ) -> Tuple[torch.Tensor, torch.Tensor, Any]:
        hidden_states, residual, *_ = self.ln_1.forward(hidden_states, residual)
        hidden_states, present = self.attn.prefill(
@ -211,7 +254,9 @@ class GPTBigCodeBlock(nn.Module):
            key_length=key_length,
        )
        hidden_states, residual, *_ = self.ln_2.forward(hidden_states, residual)
-        hidden_states = self.mlp.c_proj.forward(nn.functional.gelu(self.mlp.c_fc.forward(hidden_states), approximate="tanh"))
+        hidden_states = self.mlp.c_proj.forward(
+            nn.functional.gelu(self.mlp.c_fc.forward(hidden_states), approximate="tanh")
+        )
        return hidden_states, residual, present

    def decode(
@ -220,8 +265,8 @@ class GPTBigCodeBlock(nn.Module):
        residual: Optional[torch.Tensor],
        layer_past: torch.Tensor,
        attention_mask: torch.Tensor,
-        batch_size:int,
-        key_length:int,
+        batch_size: int,
+        key_length: int,
    ) -> Tuple[torch.Tensor, torch.Tensor, Any]:
        hidden_states, residual, *_ = self.ln_1.forward(hidden_states, residual)
        hidden_states, present = self.attn.decode(
@ -232,7 +277,9 @@ class GPTBigCodeBlock(nn.Module):
            key_length=key_length,
        )
        hidden_states, residual, *_ = self.ln_2.forward(hidden_states, residual)
-        hidden_states = self.mlp.c_proj.forward(nn.functional.gelu(self.mlp.c_fc.forward(hidden_states), approximate="tanh"))
+        hidden_states = self.mlp.c_proj.forward(
+            nn.functional.gelu(self.mlp.c_fc.forward(hidden_states), approximate="tanh")
+        )
        return hidden_states, residual, present


@ -252,11 +299,7 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):

    def _init_weights(self, module):
        """Initialize the weights."""
-        if isinstance(module, GPTBigCodeModel):
-            module.bias.fill_(True).tril_()
-        elif isinstance(module, (GPTBigCodeBlock, GPTBigCodeAttention)):
-            if isinstance(module, GPTBigCodeAttention):
-                module.mask_value.fill_(torch.finfo(torch.float32).min)
+        if isinstance(module, (GPTBigCodeMLP, GPTBigCodeAttention)):
            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
            #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
            #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
@ -264,7 +307,10 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            module.c_proj.weight.data.normal_(
-                mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+                mean=0.0,
+                std=(
+                    self.config.initializer_range / math.sqrt(2 * self.config.n_layer)
+                ),
            )
            module.c_proj._is_hf_initialized = True
        elif isinstance(module, nn.Linear):
@ -284,62 +330,86 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):

 class GPTBigCodeModel(GPTBigCodePreTrainedModel):
    # TODO: Merge into GPTBigCodeForCausalLM (needs renaming in state dict)
-    def __init__(self, config:GPTBigCodeConfig, dtype:torch.dtype):
+    def __init__(self, config: GPTBigCodeConfig, dtype: torch.dtype):
        super().__init__(config)

-        self.wte = nn.Embedding(config.vocab_size, config.hidden_size, dtype=dtype, device="meta")
-        self.wpe = nn.Embedding(config.max_position_embeddings, config.hidden_size, dtype=dtype, device="meta")
-
-        self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i, dtype=dtype) for i in range(config.num_hidden_layers)])
-        self.ln_f = FastLayerNorm(config.hidden_size, dtype=dtype, device="meta", eps=config.layer_norm_epsilon)
-
-        # Causal mask
-        self.register_buffer(
-            "causal_mask", torch.empty((config.max_position_embeddings, config.max_position_embeddings), dtype=torch.bool, device="meta")
+        self.wte = nn.Embedding(
+            config.vocab_size, config.hidden_size, dtype=dtype, device="meta"
+        )
+        self.wpe = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            dtype=dtype,
+            device="meta",
        )

-class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
-    pad_key_length_to_multiple=8
+        self.h = nn.ModuleList(
+            [
+                GPTBigCodeBlock(config, layer_idx=i, dtype=dtype)
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = FastLayerNorm(
+            config.hidden_size,
+            dtype=dtype,
+            device="meta",
+            eps=config.layer_norm_epsilon,
+        )

-    def __init__(self, config, dtype:torch.dtype, device:torch.device=torch.device("cuda")):
+
+class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
+    def __init__(
+        self, config, dtype: torch.dtype, device: torch.device = torch.device("cuda")
+    ):
        super().__init__(config)
-        if device.type!="cuda":
+        if device.type != "cuda":
            raise NotImplementedError(f"Device {device} not supported")

        self.transformer = GPTBigCodeModel(config, dtype=dtype)
-        self.lm_head = FastLinearNoBias(config.n_embd, config.vocab_size, bias=False, dtype=dtype, device="meta")
+        self.lm_head = FastLinearNoBias(
+            config.n_embd, config.vocab_size, bias=False, dtype=dtype, device="meta"
+        )
+
+        # Causal mask
+        self.causal_mask = torch.ones(
+            (config.max_position_embeddings, config.max_position_embeddings),
+            dtype=torch.bool,
+            device=device,
+        ).tril_()
+        self.mask_value = torch.full(
+            (), torch.finfo(torch.float32).min, dtype=torch.float32, device=device
+        )

        self.to_empty(device=device)

-        self._apply=self._apply_not_allowed
-
        # Initialize weights and apply final processing
+        # TODO: Skip?
        self.post_init()

-    def _apply_not_allowed(self):
-        # Dtype or device conversion would break the model.
-        raise NotImplementedError("Device or dtype conversion not supported!")
-
    def prefill(
        self,
        *,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor = None,
        position_ids: torch.Tensor,
-        predict_all_tokens: bool=True,
+        predict_all_tokens: bool = True,
    ) -> Tuple:
        batch_size, query_length = input_ids.shape

-        hidden_states = self.transformer.wte.forward(input_ids) + self.transformer.wpe.forward(position_ids)
+        hidden_states = self.transformer.wte.forward(
+            input_ids
+        ) + self.transformer.wpe.forward(position_ids)

        # Prefill (flash attn)
        # TODO: Unpad earlier (input ids)?
-        hidden_states, padding_index, sequence_lengths, key_length = unpad_input(hidden_states, attention_mask)
-        assert key_length==query_length
+        hidden_states, padding_index, sequence_lengths, key_length = unpad_input(
+            hidden_states, attention_mask
+        )
+        assert key_length == query_length

        residual = None
        past_key_values = []
-        block:GPTBigCodeBlock
+        block: GPTBigCodeBlock
        for block in self.transformer.h:
            hidden_states, residual, key_value = block.prefill(
                hidden_states,
@ -347,23 +417,39 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
                sequence_lengths=sequence_lengths,
                key_length=query_length,
            )
-            past_key_values.append(pad_input(key_value, padding_index, batch_size, query_length))
+            past_key_values.append(
+                pad_input(key_value, padding_index, batch_size, query_length)
+            )

-        hidden_states = self.transformer.ln_f.forward(hidden_states, residual)
+        hidden_states, *_ = self.transformer.ln_f.forward(hidden_states, residual)

        # Next bit is the memory bottleneck with predict_all_tokens so we free as much memory as possible.
        del residual

        if predict_all_tokens:
            hidden_states = self.lm_head.forward(hidden_states)
-            hidden_states = pad_input(hidden_states, padding_index, batch_size, query_length)
+            hidden_states = pad_input(
+                hidden_states, padding_index, batch_size, query_length
+            )
        else:
            # TODO: Index directly instead
-            hidden_states = pad_input(hidden_states, padding_index, batch_size, query_length)[:, -1]
+            hidden_states = pad_input(
+                hidden_states, padding_index, batch_size, query_length
+            )[:, -1]
            hidden_states = self.lm_head.forward(hidden_states).unsqueeze_(1)

        return hidden_states, past_key_values

+    def post_load_weights(self):
+        layer: GPTBigCodeBlock
+        for layer in self.transformer.h:
+            layer.attn.mask_value = self.mask_value
+            layer.attn.c_attn.weight.t_()
+            layer.attn.c_proj.weight.t_()
+            layer.mlp.c_fc.weight.t_()
+            layer.mlp.c_proj.weight.t_()
+        self.lm_head.weight.t_()
+
    def decode(
        self,
        *,
@ -371,26 +457,30 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
        past_key_values: List[torch.Tensor],
        attention_mask: [torch.Tensor],
        position_ids: torch.Tensor,
-        key_length:int,
+        key_length: int,
    ) -> Tuple:
-
        batch_size, query_length = input_ids.shape
        assert query_length == 1

-        hidden_states = self.transformer.wte.forward(input_ids) + self.transformer.wpe.forward(position_ids)
+        hidden_states = self.transformer.wte.forward(
+            input_ids
+        ) + self.transformer.wpe.forward(position_ids)

        # Standardize shape to (batch_size, hidden_size)
        hidden_states.squeeze_(1)

        # Self-attention mask (padding + causal).
        # TODO: Avoid unsqueeze
-        attention_mask = self.transformer.causal_mask[None, key_length - 1: key_length,
-                         :key_length] * attention_mask.unsqueeze(1)
+        attention_mask = self.causal_mask[
+            None, key_length - 1 : key_length, : attention_mask.size(-1)
+        ] * attention_mask.unsqueeze(1)
        attention_mask.unsqueeze_(2)

        residual = None
-        block:GPTBigCodeBlock
-        for i, (block, layer_past) in enumerate(zip(self.transformer.h, past_key_values)):
+        block: GPTBigCodeBlock
+        for i, (block, layer_past) in enumerate(
+            zip(self.transformer.h, past_key_values)
+        ):
            hidden_states, residual, past_key_values[i] = block.decode(
                hidden_states,
                residual=residual,
@ -400,7 +490,7 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
                key_length=key_length,
            )

-        hidden_states = self.transformer.ln_f.forward(hidden_states, residual)
+        hidden_states, *_ = self.transformer.ln_f.forward(hidden_states, residual)
        hidden_states = self.lm_head.forward(hidden_states).unsqueeze_(1)

        return hidden_states, past_key_values
--- a/server/text_generation_server/models/custom_modeling/gpt_bigcode3_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/gpt_bigcode3_modeling.py
@ -26,6 +26,7 @@ from transformers.models.gpt_bigcode.configuration_gpt_bigcode import (
    GPTBigCodeConfig,
 )

+
 class InferenceRunnerType(IntEnum):
    NO_RUNNER = 0
    # Use the inference runner without cuda graphs.
@ -38,6 +39,7 @@ class InferenceRunnerType(IntEnum):
    # Crashes with jit on A100 but seems to work without jit (PYTORCH_JIT=0) and on V100.
    FULL_GRAPH = 3

+
 try:
    from flash_attn.bert_padding import pad_input, unpad_input
    from flash_attn.flash_attn_interface import flash_attn_unpadded_func
@ -52,7 +54,11 @@ logger = logging.get_logger(__name__)

@torch.jit.script
 def upcast_masked_softmax(
-    x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
+    x: torch.Tensor,
+    mask: torch.Tensor,
+    mask_value: torch.Tensor,
+    scale: float,
+    softmax_dtype: torch.dtype,
 ):
    input_dtype = x.dtype
    x = x.to(softmax_dtype) * scale
@ -91,7 +97,7 @@ def softmax_function(
    and scaling, but only work well when the key length is a multiple of 8. For other key lengths, it is extremely
    inefficient.
    """
-    #assert x.size(-1) % 8 == 0
+    # assert x.size(-1) % 8 == 0
    if upcast:
        if mask is None:
            return upcast_softmax(x, scale, softmax_dtype)
@ -105,7 +111,13 @@ def softmax_function(


 class GPTBigCodeAttention(nn.Module):
-    def __init__(self, config, layer_idx=None, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        layer_idx=None,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__()
        self.mask_value = None
        self.embed_dim = config.hidden_size
@ -115,14 +127,27 @@ class GPTBigCodeAttention(nn.Module):

        # KV caching and padding

-        self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.head_dim, dtype=dtype, device=device)
-        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, dtype=dtype, device=device)
+        self.c_attn = nn.Linear(
+            self.embed_dim,
+            self.embed_dim + 2 * self.head_dim,
+            dtype=dtype,
+            device=device,
+        )
+        self.c_proj = nn.Linear(
+            self.embed_dim, self.embed_dim, dtype=dtype, device=device
+        )

    @torch.profiler.record_function("GPTBigCodeAttention._get_mask_value")
    def _get_mask_value(self, device, dtype):
        # torch.where expects a tensor. We use a cache to avoid recreating it every time.
-        if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
-            self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
+        if (
+            self.mask_value is None
+            or self.mask_value.dtype != dtype
+            or self.mask_value.device != device
+        ):
+            self.mask_value = torch.full(
+                [], torch.finfo(dtype).min, dtype=dtype, device=device
+            )
        return self.mask_value

    @torch.profiler.record_function("GPTBigCodeAttention._attn")
@ -156,12 +181,16 @@ class GPTBigCodeAttention(nn.Module):
            beta = 1
        else:
            beta = 0
-        attn_weights = torch.baddbmm(attn_weights, query, key, beta=beta, alpha=scale_factor).view(attn_shape)
+        attn_weights = torch.baddbmm(
+            attn_weights, query, key, beta=beta, alpha=scale_factor
+        ).view(attn_shape)

        attn_weights = softmax_function(
            attn_weights,
            attention_mask,
-            None if attention_mask is None else self._get_mask_value(attn_weights.device, softmax_dtype),
+            None
+            if attention_mask is None
+            else self._get_mask_value(attn_weights.device, softmax_dtype),
            unscale,
            softmax_dtype,
            upcast,
@ -172,7 +201,6 @@ class GPTBigCodeAttention(nn.Module):

    @torch.profiler.record_function("GPTBigCodeAttention._attn_flash")
    def _attn_flash(self, query, key, value, flash_params):
-
        query_shape = query.shape
        attn_shape = query_shape[0], self.num_heads, self.head_dim
        query = query.view(attn_shape)
@ -199,11 +227,12 @@ class GPTBigCodeAttention(nn.Module):

    @torch.profiler.record_function("GPTBigCodeAttention._merge_kv_caches")
    def _merge_kv_caches(self, key_value, layer_past, attention_mask, flash_params):
-
        # Convert to standard KV cache format.
        if flash_params is not None:
            _, padding_index, batch_size, max_sequence_length = flash_params
-            current_kv_cache = pad_input(key_value, padding_index, batch_size, max_sequence_length)
+            current_kv_cache = pad_input(
+                key_value, padding_index, batch_size, max_sequence_length
+            )
            return key_value, (current_kv_cache, max_sequence_length)

        current_kv_cache = key_value
@ -257,19 +286,23 @@ class GPTBigCodeAttention(nn.Module):
        hidden_states: torch.Tensor,
        layer_past: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
-        flash_params: Optional[Tuple] = None
+        flash_params: Optional[Tuple] = None,
    ) -> Tuple[torch.Tensor, Any]:
-        query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.head_dim), dim=-1)
+        query, key_value = self.c_attn(hidden_states).split(
+            (self.embed_dim, 2 * self.head_dim), dim=-1
+        )

        # present =  (allocated_kv_cache, key_length)
-        key_value, present = self._merge_kv_caches(key_value, layer_past, attention_mask, flash_params)
+        key_value, present = self._merge_kv_caches(
+            key_value, layer_past, attention_mask, flash_params
+        )

        key, value = key_value.split((self.head_dim, self.head_dim), dim=-1)

        if flash_params is None:
-            attn_output=self._attn(query, key, value, attention_mask)
+            attn_output = self._attn(query, key, value, attention_mask)
        else:
-            attn_output=self._attn_flash(query, key, value, flash_params)
+            attn_output = self._attn_flash(query, key, value, flash_params)

        attn_output = self.c_proj(attn_output)

@ -287,15 +320,35 @@ class GPTBigCodeMLP(nn.Module):
    @torch.profiler.record_function("GPTBigCodeMLP.forward")
    # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP.forward
    def forward(self, hidden_states: Optional[Tuple[torch.Tensor]]) -> torch.Tensor:
-        return self.c_proj(nn.functional.gelu(self.c_fc(hidden_states), approximate="tanh"))
+        return self.c_proj(
+            nn.functional.gelu(self.c_fc(hidden_states), approximate="tanh")
+        )


 class GPTBigCodeBlock(nn.Module):
-    def __init__(self, config, layer_idx=None, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        layer_idx=None,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__()
-        self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, dtype=dtype, device=device)
-        self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx, dtype=dtype, device=device)
-        self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, dtype=dtype, device=device)
+        self.ln_1 = nn.LayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_epsilon,
+            dtype=dtype,
+            device=device,
+        )
+        self.attn = GPTBigCodeAttention(
+            config, layer_idx=layer_idx, dtype=dtype, device=device
+        )
+        self.ln_2 = nn.LayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_epsilon,
+            dtype=dtype,
+            device=device,
+        )
        self.mlp = GPTBigCodeMLP(config)

    @torch.profiler.record_function("GPTBigCodeBlock.forward")
@ -304,10 +357,10 @@ class GPTBigCodeBlock(nn.Module):
        hidden_states: torch.Tensor,
        layer_past: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
-        flash_params: Optional[Tuple] = None
+        flash_params: Optional[Tuple] = None,
    ) -> Tuple[torch.Tensor, Any]:
        with torch.profiler.record_function("GPTBigCodeAttention.ln"):
-            ai=self.ln_1(hidden_states)
+            ai = self.ln_1(hidden_states)
        attn_output, present = self.attn(
            ai,
            layer_past=layer_past,
@ -320,14 +373,13 @@ class GPTBigCodeBlock(nn.Module):
        with torch.profiler.record_function("GPTBigCodeAttention.dummy"):
            pass
        with torch.profiler.record_function("GPTBigCodeAttention.ln"):
-            ai=self.ln_2(hidden_states)
-        ai=self.mlp(ai)
+            ai = self.ln_2(hidden_states)
+        ai = self.mlp(ai)
        with torch.profiler.record_function("GPTBigCodeAttention.residual"):
            hidden_states.add_(ai)
        return hidden_states, present


-
 class GPTBigCodePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@ -354,7 +406,10 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            module.c_proj.weight.data.normal_(
-                mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+                mean=0.0,
+                std=(
+                    self.config.initializer_range / math.sqrt(2 * self.config.n_layer)
+                ),
            )
            module.c_proj._is_hf_initialized = True
        elif isinstance(module, nn.Linear):
@ -373,18 +428,42 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):


 class GPTBigCodeModel(GPTBigCodePreTrainedModel):
-    def __init__(self, config, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__(config)

-        self.wte = nn.Embedding(config.vocab_size, config.hidden_size, dtype=dtype, device=device)
-        self.wpe = nn.Embedding(config.max_position_embeddings, config.hidden_size, dtype=dtype, device=device)
+        self.wte = nn.Embedding(
+            config.vocab_size, config.hidden_size, dtype=dtype, device=device
+        )
+        self.wpe = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            dtype=dtype,
+            device=device,
+        )

-        self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i, dtype=dtype, device=device) for i in range(config.num_hidden_layers)])
-        self.ln_f = nn.LayerNorm(config.hidden_size, dtype=dtype, device=device, eps=config.layer_norm_epsilon)
+        self.h = nn.ModuleList(
+            [
+                GPTBigCodeBlock(config, layer_idx=i, dtype=dtype, device=device)
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = nn.LayerNorm(
+            config.hidden_size,
+            dtype=dtype,
+            device=device,
+            eps=config.layer_norm_epsilon,
+        )

-        self.inference_runner_type = InferenceRunnerType.NO_RUNNER #InferenceRunnerType(config.inference_runner)
+        self.inference_runner_type = (
+            InferenceRunnerType.NO_RUNNER
+        )  # InferenceRunnerType(config.inference_runner)

-        self.flash_attention = True #config.flash_attention
+        self.flash_attention = True  # config.flash_attention

        if self.flash_attention:
            if flash_attn_unpadded_func is None:
@ -402,13 +481,20 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):

        # Causal mask
        self.register_buffer(
-            "bias", torch.empty((config.max_position_embeddings, config.max_position_embeddings), dtype=torch.bool, device=device)
+            "bias",
+            torch.empty(
+                (config.max_position_embeddings, config.max_position_embeddings),
+                dtype=torch.bool,
+                device=device,
+            ),
        )

-    #@torch.profiler.record_function("GPTBigCodeModel._get_causal_mask")
+    # @torch.profiler.record_function("GPTBigCodeModel._get_causal_mask")
    def _get_causal_mask(self, padding_mask, query_length, key_length):
        # Self-attention mask.
-        attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
+        attention_mask = self.bias[
+            None, key_length - query_length : key_length, :key_length
+        ]

        if padding_mask is not None:
            attention_mask = attention_mask * padding_mask.unsqueeze(1).to(
@ -416,12 +502,14 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
            )
        pad = -key_length % 8
        if pad > 0:
-            attention_mask = torch.nn.functional.pad(attention_mask, (0, pad), mode="constant", value=False)
+            attention_mask = torch.nn.functional.pad(
+                attention_mask, (0, pad), mode="constant", value=False
+            )

        # (batch_size, query_length, n_heads, key_length)
        return attention_mask.unsqueeze(2)

-    #@torch.profiler.record_function("GPTBigCodeModel.forward")
+    # @torch.profiler.record_function("GPTBigCodeModel.forward")
    def forward(
        self,
        *,
@ -433,7 +521,9 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
        if self.inference_runner is not None and past_key_values is not None:
            if self.config.validate_runner_input:
                assert past_key_values is not None
-            return self.inference_runner.forward(input_ids, attention_mask, position_ids, past_key_values)
+            return self.inference_runner.forward(
+                input_ids, attention_mask, position_ids, past_key_values
+            )

        batch_size, query_length = input_ids.shape

@ -444,30 +534,38 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
        else:
            past_length = past_key_values[0][1]

-
        hidden_states = self.wte(input_ids) + self.wpe(position_ids)

        # TODO: Unpad earlier (input ids), support unpadded input?
        if flash_attention:
-            hidden_states, padding_index, sequence_lengths, max_sequence_length = unpad_input(
-                hidden_states, attention_mask
+            (
+                hidden_states,
+                padding_index,
+                sequence_lengths,
+                max_sequence_length,
+            ) = unpad_input(hidden_states, attention_mask)
+            flash_params = (
+                sequence_lengths,
+                padding_index,
+                batch_size,
+                max_sequence_length,
            )
-            flash_params = (sequence_lengths, padding_index, batch_size, max_sequence_length)
-            attention_mask=None
+            attention_mask = None
        else:
            key_length = past_length + query_length
            # Self-attention mask (padding + causal).
-            attention_mask = self._get_causal_mask(attention_mask, query_length, key_length)
-            flash_params=None
+            attention_mask = self._get_causal_mask(
+                attention_mask, query_length, key_length
+            )
+            flash_params = None

        presents = []
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-
            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
-                flash_params=flash_params
+                flash_params=flash_params,
            )

            hidden_states = outputs[0]
@ -476,24 +574,33 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
        hidden_states = self.ln_f(hidden_states)

        if flash_attention:
-            hidden_states = pad_input(hidden_states, padding_index, batch_size, query_length)
+            hidden_states = pad_input(
+                hidden_states, padding_index, batch_size, query_length
+            )

        return hidden_states, presents


 class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
-    def __init__(self, config, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__(config)
-        meta=torch.device("meta")
+        meta = torch.device("meta")
        self.transformer = GPTBigCodeModel(config, dtype=dtype, device=meta)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False, dtype=dtype, device=meta)
+        self.lm_head = nn.Linear(
+            config.n_embd, config.vocab_size, bias=False, dtype=dtype, device=meta
+        )

        self.to_empty(device=device)

        # Initialize weights and apply final processing
        self.post_init()

-    #@torch.profiler.record_function("GPTBigCodeForCausalLM.forward")
+    # @torch.profiler.record_function("GPTBigCodeForCausalLM.forward")
    def forward(
        self,
        *,
@ -501,16 +608,15 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
        past_key_values: Optional[Union[List[torch.Tensor], int]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor,
-        predict_all_tokens: bool=True,
+        predict_all_tokens: bool = True,
    ) -> Tuple:
-
-        hidden_states, presents=self.transformer(
+        hidden_states, presents = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
-            position_ids=position_ids
+            position_ids=position_ids,
        )
-        #with torch.profiler.record_function("GPTBigCodeForCausalLM.head"):
+        # with torch.profiler.record_function("GPTBigCodeForCausalLM.head"):
        if not predict_all_tokens:
            # We only care about the last token.
            hidden_states = hidden_states[:, -1:]
--- a/server/text_generation_server/models/custom_modeling/gpt_bigcode_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/gpt_bigcode_modeling.py
@ -20,14 +20,13 @@ import torch
 import torch.utils.checkpoint
 from torch import nn

-import dropout_layer_norm
-
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
 from transformers.models.gpt_bigcode.configuration_gpt_bigcode import (
    GPTBigCodeConfig,
 )

+
 class InferenceRunnerType(IntEnum):
    NO_RUNNER = 0
    # Use the inference runner without cuda graphs.
@ -41,7 +40,6 @@ class InferenceRunnerType(IntEnum):
    FULL_GRAPH = 3


-
 try:
    from flash_attn.bert_padding import pad_input, unpad_input
    from flash_attn.flash_attn_interface import flash_attn_unpadded_func
@ -56,7 +54,11 @@ logger = logging.get_logger(__name__)

@torch.jit.script
 def upcast_masked_softmax(
-    x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
+    x: torch.Tensor,
+    mask: torch.Tensor,
+    mask_value: torch.Tensor,
+    scale: float,
+    softmax_dtype: torch.dtype,
 ):
    input_dtype = x.dtype
    x = x.to(softmax_dtype) * scale
@ -94,7 +96,7 @@ def softmax_function(
    and scaling, but only work well when the key length is a multiple of 8. For other key lengths, it is extremely
    inefficient.
    """
-    #assert x.size(-1) % 8 == 0
+    # assert x.size(-1) % 8 == 0
    if upcast:
        if mask is None:
            return upcast_softmax(x, scale, softmax_dtype)
@ -108,7 +110,13 @@ def softmax_function(


 class GPTBigCodeAttention(nn.Module):
-    def __init__(self, config, layer_idx=None, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        layer_idx=None,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__()
        self.mask_value = None
        self.embed_dim = config.hidden_size
@ -118,13 +126,26 @@ class GPTBigCodeAttention(nn.Module):

        # KV caching and padding

-        self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.head_dim, dtype=dtype, device=device)
-        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, dtype=dtype, device=device)
+        self.c_attn = nn.Linear(
+            self.embed_dim,
+            self.embed_dim + 2 * self.head_dim,
+            dtype=dtype,
+            device=device,
+        )
+        self.c_proj = nn.Linear(
+            self.embed_dim, self.embed_dim, dtype=dtype, device=device
+        )

    def _get_mask_value(self, device, dtype):
        # torch.where expects a tensor. We use a cache to avoid recreating it every time.
-        if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
-            self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
+        if (
+            self.mask_value is None
+            or self.mask_value.dtype != dtype
+            or self.mask_value.device != device
+        ):
+            self.mask_value = torch.full(
+                [], torch.finfo(dtype).min, dtype=dtype, device=device
+            )
        return self.mask_value

    def _attn(self, query, key, value, attention_mask):
@ -157,12 +178,16 @@ class GPTBigCodeAttention(nn.Module):
            beta = 1
        else:
            beta = 0
-        attn_weights = torch.baddbmm(attn_weights, query, key, beta=beta, alpha=scale_factor).view(attn_shape)
+        attn_weights = torch.baddbmm(
+            attn_weights, query, key, beta=beta, alpha=scale_factor
+        ).view(attn_shape)

        attn_weights = softmax_function(
            attn_weights,
            attention_mask,
-            None if attention_mask is None else self._get_mask_value(attn_weights.device, softmax_dtype),
+            None
+            if attention_mask is None
+            else self._get_mask_value(attn_weights.device, softmax_dtype),
            unscale,
            softmax_dtype,
            upcast,
@ -172,7 +197,6 @@ class GPTBigCodeAttention(nn.Module):
        return attn_output

    def _attn_flash(self, query, key, value, flash_params):
-
        query_shape = query.shape
        attn_shape = query_shape[0], self.num_heads, self.head_dim
        query = query.view(attn_shape)
@ -198,11 +222,12 @@ class GPTBigCodeAttention(nn.Module):
        return attn_output

    def _merge_kv_caches(self, key_value, layer_past, attention_mask, flash_params):
-
        # Convert to standard KV cache format.
        if flash_params is not None:
            _, padding_index, batch_size, max_sequence_length = flash_params
-            current_kv_cache = pad_input(key_value, padding_index, batch_size, max_sequence_length)
+            current_kv_cache = pad_input(
+                key_value, padding_index, batch_size, max_sequence_length
+            )
            return key_value, (current_kv_cache, max_sequence_length)

        current_kv_cache = key_value
@ -255,19 +280,23 @@ class GPTBigCodeAttention(nn.Module):
        hidden_states: torch.Tensor,
        layer_past: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
-        flash_params: Optional[Tuple] = None
+        flash_params: Optional[Tuple] = None,
    ) -> Tuple[torch.Tensor, Any]:
-        query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.head_dim), dim=-1)
+        query, key_value = self.c_attn(hidden_states).split(
+            (self.embed_dim, 2 * self.head_dim), dim=-1
+        )

        # present =  (allocated_kv_cache, key_length)
-        key_value, present = self._merge_kv_caches(key_value, layer_past, attention_mask, flash_params)
+        key_value, present = self._merge_kv_caches(
+            key_value, layer_past, attention_mask, flash_params
+        )

        key, value = key_value.split((self.head_dim, self.head_dim), dim=-1)

        if flash_params is None:
-            attn_output=self._attn(query, key, value, attention_mask)
+            attn_output = self._attn(query, key, value, attention_mask)
        else:
-            attn_output=self._attn_flash(query, key, value, flash_params)
+            attn_output = self._attn_flash(query, key, value, flash_params)

        attn_output = self.c_proj(attn_output)

@ -284,15 +313,35 @@ class GPTBigCodeMLP(nn.Module):

    # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP.forward
    def forward(self, hidden_states: Optional[Tuple[torch.Tensor]]) -> torch.Tensor:
-        return self.c_proj(nn.functional.gelu(self.c_fc(hidden_states), approximate="tanh"))
+        return self.c_proj(
+            nn.functional.gelu(self.c_fc(hidden_states), approximate="tanh")
+        )


 class GPTBigCodeBlock(nn.Module):
-    def __init__(self, config, layer_idx=None, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        layer_idx=None,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__()
-        self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, dtype=dtype, device=device)
-        self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx, dtype=dtype, device=device)
-        self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, dtype=dtype, device=device)
+        self.ln_1 = nn.LayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_epsilon,
+            dtype=dtype,
+            device=device,
+        )
+        self.attn = GPTBigCodeAttention(
+            config, layer_idx=layer_idx, dtype=dtype, device=device
+        )
+        self.ln_2 = nn.LayerNorm(
+            config.hidden_size,
+            eps=config.layer_norm_epsilon,
+            dtype=dtype,
+            device=device,
+        )
        self.mlp = GPTBigCodeMLP(config)

    def forward(
@ -300,7 +349,7 @@ class GPTBigCodeBlock(nn.Module):
        hidden_states: torch.Tensor,
        layer_past: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
-        flash_params: Optional[Tuple] = None
+        flash_params: Optional[Tuple] = None,
    ) -> Tuple[torch.Tensor, Any]:
        attn_output, present = self.attn(
            self.ln_1(hidden_states),
@ -313,7 +362,6 @@ class GPTBigCodeBlock(nn.Module):
        return hidden_states, present


-
 class GPTBigCodePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@ -340,7 +388,10 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            module.c_proj.weight.data.normal_(
-                mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+                mean=0.0,
+                std=(
+                    self.config.initializer_range / math.sqrt(2 * self.config.n_layer)
+                ),
            )
            module.c_proj._is_hf_initialized = True
        elif isinstance(module, nn.Linear):
@ -359,18 +410,42 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):


 class GPTBigCodeModel(GPTBigCodePreTrainedModel):
-    def __init__(self, config, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__(config)

-        self.wte = nn.Embedding(config.vocab_size, config.hidden_size, dtype=dtype, device=device)
-        self.wpe = nn.Embedding(config.max_position_embeddings, config.hidden_size, dtype=dtype, device=device)
+        self.wte = nn.Embedding(
+            config.vocab_size, config.hidden_size, dtype=dtype, device=device
+        )
+        self.wpe = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            dtype=dtype,
+            device=device,
+        )

-        self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i, dtype=dtype, device=device) for i in range(config.num_hidden_layers)])
-        self.ln_f = nn.LayerNorm(config.hidden_size, dtype=dtype, device=device, eps=config.layer_norm_epsilon)
+        self.h = nn.ModuleList(
+            [
+                GPTBigCodeBlock(config, layer_idx=i, dtype=dtype, device=device)
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = nn.LayerNorm(
+            config.hidden_size,
+            dtype=dtype,
+            device=device,
+            eps=config.layer_norm_epsilon,
+        )

-        self.inference_runner_type = InferenceRunnerType.NO_RUNNER #InferenceRunnerType(config.inference_runner)
+        self.inference_runner_type = (
+            InferenceRunnerType.NO_RUNNER
+        )  # InferenceRunnerType(config.inference_runner)

-        self.flash_attention = True #config.flash_attention
+        self.flash_attention = True  # config.flash_attention

        if self.flash_attention:
            if flash_attn_unpadded_func is None:
@ -388,12 +463,19 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):

        # Causal mask
        self.register_buffer(
-            "bias", torch.empty((config.max_position_embeddings, config.max_position_embeddings), dtype=torch.bool, device=device)
+            "bias",
+            torch.empty(
+                (config.max_position_embeddings, config.max_position_embeddings),
+                dtype=torch.bool,
+                device=device,
+            ),
        )

    def _get_causal_mask(self, padding_mask, query_length, key_length):
        # Self-attention mask.
-        attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
+        attention_mask = self.bias[
+            None, key_length - query_length : key_length, :key_length
+        ]

        if padding_mask is not None:
            attention_mask = attention_mask * padding_mask.unsqueeze(1).to(
@ -401,7 +483,9 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
            )
        pad = -key_length % 8
        if pad > 0:
-            attention_mask = torch.nn.functional.pad(attention_mask, (0, pad), mode="constant", value=False)
+            attention_mask = torch.nn.functional.pad(
+                attention_mask, (0, pad), mode="constant", value=False
+            )

        # (batch_size, query_length, n_heads, key_length)
        return attention_mask.unsqueeze(2)
@ -417,7 +501,9 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
        if self.inference_runner is not None and past_key_values is not None:
            if self.config.validate_runner_input:
                assert past_key_values is not None
-            return self.inference_runner.forward(input_ids, attention_mask, position_ids, past_key_values)
+            return self.inference_runner.forward(
+                input_ids, attention_mask, position_ids, past_key_values
+            )

        batch_size, query_length = input_ids.shape

@ -428,30 +514,38 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
        else:
            past_length = past_key_values[0][1]

-
        hidden_states = self.wte(input_ids) + self.wpe(position_ids)

        # TODO: Unpad earlier (input ids), support unpadded input?
        if flash_attention:
-            hidden_states, padding_index, sequence_lengths, max_sequence_length = unpad_input(
-                hidden_states, attention_mask
+            (
+                hidden_states,
+                padding_index,
+                sequence_lengths,
+                max_sequence_length,
+            ) = unpad_input(hidden_states, attention_mask)
+            flash_params = (
+                sequence_lengths,
+                padding_index,
+                batch_size,
+                max_sequence_length,
            )
-            flash_params = (sequence_lengths, padding_index, batch_size, max_sequence_length)
-            attention_mask=None
+            attention_mask = None
        else:
            key_length = past_length + query_length
            # Self-attention mask (padding + causal).
-            attention_mask = self._get_causal_mask(attention_mask, query_length, key_length)
-            flash_params=None
+            attention_mask = self._get_causal_mask(
+                attention_mask, query_length, key_length
+            )
+            flash_params = None

        presents = []
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-
            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
-                flash_params=flash_params
+                flash_params=flash_params,
            )

            hidden_states = outputs[0]
@ -460,17 +554,26 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
        hidden_states = self.ln_f(hidden_states)

        if flash_attention:
-            hidden_states = pad_input(hidden_states, padding_index, batch_size, query_length)
+            hidden_states = pad_input(
+                hidden_states, padding_index, batch_size, query_length
+            )

        return hidden_states, presents


 class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
-    def __init__(self, config, dtype:torch.dtype=torch.float32, device:torch.device=torch.device("cpu")):
+    def __init__(
+        self,
+        config,
+        dtype: torch.dtype = torch.float32,
+        device: torch.device = torch.device("cpu"),
+    ):
        super().__init__(config)
-        meta=torch.device("meta")
+        meta = torch.device("meta")
        self.transformer = GPTBigCodeModel(config, dtype=dtype, device=meta)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False, dtype=dtype, device=meta)
+        self.lm_head = nn.Linear(
+            config.n_embd, config.vocab_size, bias=False, dtype=dtype, device=meta
+        )

        self.to_empty(device=device)

@ -484,14 +587,13 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
        past_key_values: Optional[Union[List[torch.Tensor], int]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor,
-        predict_all_tokens: bool=True,
+        predict_all_tokens: bool = True,
    ) -> Tuple:
-
-        hidden_states, presents=self.transformer(
+        hidden_states, presents = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
-            position_ids=position_ids
+            position_ids=position_ids,
        )

        if not predict_all_tokens:
--- a/server/text_generation_server/models/gpt_bigcode.py
+++ b/server/text_generation_server/models/gpt_bigcode.py
@ -6,8 +6,13 @@ from opentelemetry import trace
 from transformers import AutoTokenizer
 from typing import Optional, Type

-from text_generation_server.models.vectorized_causal_lm import VectorizedCausalLM,VectorizedCausalLMBatch
-from text_generation_server.models.custom_modeling.gpt_bigcode_modeling import GPTBigCodeForCausalLM
+from text_generation_server.models.vectorized_causal_lm import (
+    VectorizedCausalLM,
+    VectorizedCausalLMBatch,
+)
+from text_generation_server.models.custom_modeling.gpt_bigcode_modeling import (
+    GPTBigCodeForCausalLM,
+)


 tracer = trace.get_tracer(__name__)
@ -24,7 +29,9 @@ class BigcodeBatch(VectorizedCausalLMBatch):
                layer_kv.data = layer_kv[keep_indices, sequence_slice]

    @classmethod
-    def _concatenate_key_values(cls, batches, start_indices, end_indices, left_indices, max_input_length):
+    def _concatenate_key_values(
+        cls, batches, start_indices, end_indices, left_indices, max_input_length
+    ):
        device = batches[0].input_ids.device
        batch_size = sum([len(batch.requests) for batch in batches])

@ -35,13 +42,21 @@ class BigcodeBatch(VectorizedCausalLMBatch):
        past_key_values = []
        for kv_caches in zip(*(batch.past_key_values for batch in batches)):
            key_values, seq_lengths = zip(*kv_caches)
-            assert all(left_index + seq_length == max_input_length for left_index, seq_length in zip(left_indices, seq_lengths))
+            assert all(
+                left_index + seq_length == max_input_length
+                for left_index, seq_length in zip(left_indices, seq_lengths)
+            )

-            allocate_seq_len=max(left_index + key_value.size(1) for left_index, key_value in  zip(left_indices, key_values))
-            allocate_seq_len += - allocate_seq_len % 8
+            allocate_seq_len = max(
+                left_index + key_value.size(1)
+                for left_index, key_value in zip(left_indices, key_values)
+            )
+            allocate_seq_len += -allocate_seq_len % 8

            kv_cache = torch.empty(
-                (batch_size, allocate_seq_len, *key_values[0].shape[2:]), dtype=key_values[0].dtype, device=device
+                (batch_size, allocate_seq_len, *key_values[0].shape[2:]),
+                dtype=key_values[0].dtype,
+                device=device,
            )
            for key_value, start_index, end_index, left_index in zip(
                key_values,
@ -49,7 +64,9 @@ class BigcodeBatch(VectorizedCausalLMBatch):
                end_indices,
                left_indices,
            ):
-                kv_cache[start_index:end_index,left_index:max_input_length].copy_(key_value)
+                kv_cache[start_index:end_index, left_index:max_input_length].copy_(
+                    key_value
+                )
                # Set padding to zero to avoid propagating nans.
                kv_cache[start_index:end_index, :left_index].fill_(0)
                kv_cache[start_index:end_index, max_input_length:].fill_(0)
@ -58,6 +75,7 @@ class BigcodeBatch(VectorizedCausalLMBatch):
    def __len__(self):
        return len(self.requests)

+
 class BigcodeCausalLM(VectorizedCausalLM):
    def __init__(
        self,
@ -103,7 +121,7 @@ class BigcodeCausalLM(VectorizedCausalLM):
    def batch_type(self) -> Type[BigcodeBatch]:
        return BigcodeBatch

-    def forward(self, batch:BigcodeBatch):
+    def forward(self, batch: BigcodeBatch):
        key_length = batch.max_input_length
        query_length = key_length if batch.past_key_values is None else 1
        input_ids = batch.input_ids[:, key_length - query_length : key_length]
@ -113,7 +131,7 @@ class BigcodeCausalLM(VectorizedCausalLM):
            attention_mask=batch.attention_mask[:, :key_length],
            position_ids=batch.position_ids[:, key_length - query_length : key_length],
            past_key_values=batch.past_key_values,
-            use_cache=True,
+            predict_all_tokens=batch.details,
        )
        next_token_ids, logprobs = batch.next_token_chooser(
            input_ids, logits, batch.details
@ -126,10 +144,20 @@ class BigcodeCausalLM(VectorizedCausalLM):

        return next_token_ids, logprobs

-    def mock_kv_cache(self, batch: BigcodeBatch, dtype:Optional[torch.dtype]):
-        allocate_length=batch.max_input_length+-batch.max_input_length%8
-        return [(torch.empty(
-            [len(batch), allocate_length-1, 2 * self.model.config.n_embd // self.model.config.n_head],
-            dtype=dtype,
-            device=batch.input_ids.device,
-        ),batch.max_input_length-1) for _ in range(self.model.config.n_layer)]
+    def mock_kv_cache(self, batch: BigcodeBatch, dtype: Optional[torch.dtype]):
+        allocate_length = batch.max_input_length + -batch.max_input_length % 8
+        return [
+            (
+                torch.empty(
+                    [
+                        len(batch),
+                        allocate_length - 1,
+                        2 * self.model.config.n_embd // self.model.config.n_head,
+                    ],
+                    dtype=dtype,
+                    device=batch.input_ids.device,
+                ),
+                batch.max_input_length - 1,
+            )
+            for _ in range(self.model.config.n_layer)
+        ]
--- a/server/text_generation_server/models/gpt_bigcode2.py
+++ b/server/text_generation_server/models/gpt_bigcode2.py
@ -4,10 +4,17 @@ from dataclasses import dataclass

 from opentelemetry import trace
 from transformers import AutoTokenizer
-from typing import Optional, Type
+from typing import Optional, Type, List
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase

-from text_generation_server.models.vectorized_causal_lm import VectorizedCausalLM,VectorizedCausalLMBatch
-from text_generation_server.models.custom_modeling.gpt_bigcode2_modeling import GPTBigCodeForCausalLM
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models.vectorized_causal_lm import (
+    VectorizedCausalLM,
+    VectorizedCausalLMBatch,
+)
+from text_generation_server.models.custom_modeling.gpt_bigcode2_modeling import (
+    GPTBigCodeForCausalLM,
+)

 tracer = trace.get_tracer(__name__)

@ -15,19 +22,40 @@ tracer = trace.get_tracer(__name__)
@dataclass
 class Bigcode2Batch(VectorizedCausalLMBatch):
    kv_cache_seq_dim: int = 1
-    pad_key_length_to_multiple:int=8
+    pad_key_length_to_multiple: int = 8

    # Prefill the attention mask for padded key length.
-    attention_mask_fill_value=False
+    attention_mask_fill_value = False
+
+    @classmethod
+    def from_pb(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        device: torch.device,
+    ) -> "Bigcode2Batch":
+        batch = super().from_pb(pb, tokenizer, device)
+        batch.attention_mask[:, batch.max_input_length :].fill_(False)
+        return batch

    def _filter_kv_caches(self, keep_indices, sequence_slice):
        if self.past_key_values is not None:
-            for layer_kv, _ in self.past_key_values:
+            for layer_kv in self.past_key_values:
                # Update tensors in-place to allow incremental garbage collection
                layer_kv.data = layer_kv[keep_indices, sequence_slice]

    @classmethod
-    def _concatenate_key_values(cls, batches, start_indices, end_indices, left_indices, max_input_length):
+    def concatenate(cls, batches: List["Bigcode2Batch"]) -> "Bigcode2Batch":
+        batch = super().concatenate(batches)
+        # Replace the attention mask with zeros to support padded key length.
+        # They are already filled with ones in super, but duplication is needed to generate the position ids.
+        batch.attention_mask[:, batch.max_input_length :].fill_(False)
+        return batch
+
+    @classmethod
+    def _concatenate_key_values(
+        cls, batches, start_indices, end_indices, left_indices, max_input_length
+    ):
        device = batches[0].input_ids.device
        batch_size = sum([len(batch.requests) for batch in batches])

@ -36,15 +64,19 @@ class Bigcode2Batch(VectorizedCausalLMBatch):
                raise ValueError("Only concatenate prefilled batches")

        past_key_values = []
-        for kv_caches in zip(*(batch.past_key_values for batch in batches)):
-            key_values, seq_lengths = zip(*kv_caches)
-            assert all(left_index + seq_length == max_input_length for left_index, seq_length in zip(left_indices, seq_lengths))
-
-            allocate_seq_len=max(left_index + key_value.size(1) for left_index, key_value in  zip(left_indices, key_values))
-            allocate_seq_len += - allocate_seq_len % batches[0].pad_key_length_to_multiple
+        for key_values in zip(*(batch.past_key_values for batch in batches)):
+            allocate_seq_len = max(
+                left_index + key_value.size(1)
+                for left_index, key_value in zip(left_indices, key_values)
+            )
+            allocate_seq_len += (
+                -allocate_seq_len % batches[0].pad_key_length_to_multiple
+            )

            kv_cache = torch.empty(
-                (batch_size, allocate_seq_len, *key_values[0].shape[2:]), dtype=key_values[0].dtype, device=device
+                (batch_size, allocate_seq_len, *key_values[0].shape[2:]),
+                dtype=key_values[0].dtype,
+                device=device,
            )
            for key_value, start_index, end_index, left_index in zip(
                key_values,
@ -52,16 +84,21 @@ class Bigcode2Batch(VectorizedCausalLMBatch):
                end_indices,
                left_indices,
            ):
-                kv_cache[start_index:end_index,left_index:max_input_length].copy_(key_value)
+                kv_cache[start_index:end_index, left_index:max_input_length].copy_(
+                    key_value
+                )
                # Set padding to zero to avoid propagating nans.
                kv_cache[start_index:end_index, :left_index].fill_(0)
                kv_cache[start_index:end_index, max_input_length:].fill_(0)
-            past_key_values.append((kv_cache, max_input_length))
+            past_key_values.append(kv_cache)

    def __len__(self):
        return len(self.requests)

+
 class Bigcode2CausalLM(VectorizedCausalLM):
+    model: GPTBigCodeForCausalLM
+
    def __init__(
        self,
        model_id: str,
@ -85,9 +122,12 @@ class Bigcode2CausalLM(VectorizedCausalLM):
            model_id,
            revision=revision,
            torch_dtype=dtype,
+            dtype=dtype,
            device_map="auto" if torch.cuda.is_available() else None,
            load_in_8bit=quantize == "bitsandbytes",
        )
+        model.post_load_weights()
+
        tokenizer.pad_token_id = (
            model.config.pad_token_id
            if model.config.pad_token_id is not None
@ -106,29 +146,33 @@ class Bigcode2CausalLM(VectorizedCausalLM):
    def batch_type(self) -> Type[Bigcode2Batch]:
        return Bigcode2Batch

-    def forward(self, batch:Bigcode2Batch):
+    def forward(self, batch: Bigcode2Batch):
        key_length = batch.max_input_length
        if batch.past_key_values is None:
            # Prefill (flash attn, unpadded key length)
-            batch.pad_key_length_to_multiple=self.model.pad_key_length_to_multiple
-            padded_key_length=key_length
-            query_length=key_length
+            input_ids = batch.input_ids[:, :key_length]
+            logits, batch.past_key_values = self.model.prefill(
+                input_ids=input_ids,
+                attention_mask=batch.attention_mask[:, :key_length],
+                position_ids=batch.position_ids[:, :key_length],
+                predict_all_tokens=batch.details,
+            )
        else:
            # Decode (fused attn, padded key length)
-            batch.attention_mask[:, key_length-1].fill_(True)
-            padded_key_length=key_length+-key_length%batch.pad_key_length_to_multiple
-            query_length=1
+            batch.attention_mask[:, key_length - 1].fill_(True)
+            padded_key_length = (
+                key_length + -key_length % batch.pad_key_length_to_multiple
+            )
+            input_ids = batch.input_ids[:, key_length - 1 : key_length]
+            # Model Forward
+            logits, batch.past_key_values = self.model.decode(
+                input_ids=input_ids,
+                attention_mask=batch.attention_mask[:, :padded_key_length],
+                position_ids=batch.position_ids[:, key_length - 1 : key_length],
+                past_key_values=batch.past_key_values,
+                key_length=key_length,
+            )

-        input_ids = batch.input_ids[:, key_length - query_length : key_length]
-        # Model Forward
-        logits, batch.past_key_values = self.model.forward(
-            input_ids=input_ids,
-            attention_mask=batch.attention_mask[:, :padded_key_length],
-            position_ids=batch.position_ids[:, key_length - query_length : key_length],
-            past_key_values=batch.past_key_values,
-            key_length=key_length,
-            predict_all_tokens=batch.details
-        )
        next_token_ids, logprobs = batch.next_token_chooser(
            input_ids, logits, batch.details
        )
@ -140,10 +184,20 @@ class Bigcode2CausalLM(VectorizedCausalLM):

        return next_token_ids, logprobs

-    def mock_kv_cache(self, batch: Bigcode2Batch, dtype:Optional[torch.dtype]):
-        allocate_length=batch.max_input_length+-batch.max_input_length%batch.pad_key_length_to_multiple
-        return [(torch.empty(
-            [len(batch), allocate_length-1, 2 * self.model.config.n_embd // self.model.config.n_head],
-            dtype=dtype,
-            device=batch.input_ids.device,
-        ),batch.max_input_length-1) for _ in range(self.model.config.n_layer)]
+    def mock_kv_cache(self, batch: Bigcode2Batch, dtype: Optional[torch.dtype]):
+        allocate_length = (
+            batch.max_input_length
+            + -batch.max_input_length % batch.pad_key_length_to_multiple
+        )
+        return [
+            torch.randn(
+                [
+                    len(batch),
+                    allocate_length - 1,
+                    2 * self.model.config.n_embd // self.model.config.n_head,
+                ],
+                dtype=dtype,
+                device=batch.input_ids.device,
+            )
+            for _ in range(self.model.config.n_layer)
+        ]
--- a/server/text_generation_server/models/vectorized_causal_lm.py
+++ b/server/text_generation_server/models/vectorized_causal_lm.py
@ -58,9 +58,6 @@ class VectorizedCausalLMBatch(Batch):

    kv_cache_seq_dim: int = 2

-    # Prefill the attention mask for the generated tokens
-    attention_mask_fill_value=True
-
    # TODO: Get from requests (should these be lists?)
    details: bool = os.environ.get("RETURN_DETAILS") is not None
    generate_stream: bool = os.environ.get("GENERATE_STREAM") is not None
@ -116,7 +113,7 @@ class VectorizedCausalLMBatch(Batch):
        attention_mask = torch.empty(input_shape, dtype=torch.bool, device=device)
        # Copy tokenizer attention_mask into fully allocated attention_mask
        attention_mask[:, :max_input_length].copy_(tokenized_inputs["attention_mask"])
-        attention_mask[:, max_input_length:].fill_(cls.attention_mask_fill_value)
+        attention_mask[:, max_input_length:].fill_(True)

        position_ids = attention_mask.cumsum(-1).sub_(1)
        position_ids[:, :max_input_length].relu_()
@ -271,7 +268,10 @@ class VectorizedCausalLMBatch(Batch):
        # Allocate maximum attention_mask
        attention_mask = torch.empty(input_shape, dtype=torch.bool, device=device)
        attention_mask[:, :max_input_length].fill_(0)
-        attention_mask[:, max_input_length:].fill_(cls.attention_mask_fill_value)
+        attention_mask[:, max_input_length:].fill_(True)
+
+        position_ids = attention_mask.cumsum(-1).sub_(1)
+        position_ids[:, :max_input_length].relu_()

        input_ids = torch.empty(input_shape, dtype=torch.int64, device=device)
        # TODO : only needed for prefill
@ -287,16 +287,15 @@ class VectorizedCausalLMBatch(Batch):
                batch.input_ids[:, : batch.max_input_length]
            )

-        position_ids = attention_mask.cumsum(-1).sub_(1)
-        position_ids[:, :max_input_length].relu_()
-
        max_tokens = sum(
            batch.max_tokens + (max_input_length - batch.max_input_length) * len(batch)
            for batch in batches
        )

        kv_cache_seq_dim = batches[0].kv_cache_seq_dim
-        past_key_values=cls._concatenate_key_values(batches, start_indices, end_indices, left_indices, max_input_length)
+        past_key_values = cls._concatenate_key_values(
+            batches, start_indices, end_indices, left_indices, max_input_length
+        )

        return cls(
            batch_id=batches[0].batch_id,
@ -317,7 +316,9 @@ class VectorizedCausalLMBatch(Batch):
        )

    @classmethod
-    def _concatenate_key_values(cls, batches, start_indices, end_indices, left_indices, max_input_length):
+    def _concatenate_key_values(
+        cls, batches, start_indices, end_indices, left_indices, max_input_length
+    ):
        device = batches[0].input_ids.device
        batch_size = sum([len(batch.requests) for batch in batches])

@ -386,10 +387,10 @@ class VectorizedCausalLMBatch(Batch):

        return

-
    def __len__(self):
        return len(self.requests)

+
 class VectorizedCausalLM(Model):
    def __init__(
        self,
@ -441,7 +442,7 @@ class VectorizedCausalLM(Model):
            generated_ids, skip_special_tokens=True, cleanup_tokenization_spaces=False
        )

-    def forward(self, batch:VectorizedCausalLMBatch):
+    def forward(self, batch: VectorizedCausalLMBatch):
        key_length = batch.max_input_length
        query_length = key_length if batch.past_key_values is None else 1
        input_ids = batch.input_ids[:, key_length - query_length : key_length]
@ -499,7 +500,7 @@ class VectorizedCausalLM(Model):
                    prefill_token_ids, prefill_logprobs, batch.input_lengths
                ):
                    # Input length has already been incremented so we subtract 1.
-                    prefill_token_ids_ = prefill_token_ids_[-(input_length-1):]
+                    prefill_token_ids_ = prefill_token_ids_[-(input_length - 1) :]
                    prefill_texts = self.tokenizer.batch_decode(
                        prefill_token_ids_,
                        clean_up_tokenization_spaces=False,
@ -558,23 +559,42 @@ class VectorizedCausalLM(Model):

        return generations, next_batch

-    def mock_kv_cache(self, batch: VectorizedCausalLMBatch, dtype:Optional[torch.dtype]):
-        from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeForCausalLM
+    def mock_kv_cache(
+        self, batch: VectorizedCausalLMBatch, dtype: Optional[torch.dtype]
+    ):
+        from transformers.models.gpt_bigcode.modeling_gpt_bigcode import (
+            GPTBigCodeForCausalLM,
+        )
+
        if not isinstance(self.model, GPTBigCodeForCausalLM):
            raise NotImplementedError()
-        return [torch.empty(
-            [len(batch), batch.max_input_length-1, 2 * self.model.config.n_embd // self.model.config.n_head],
-            dtype=dtype,
-            device=batch.input_ids.device,
-        ) for _ in range(self.model.config.n_layer)]
+        return [
+            torch.empty(
+                [
+                    len(batch),
+                    batch.max_input_length - 1,
+                    2 * self.model.config.n_embd // self.model.config.n_head,
+                ],
+                dtype=dtype,
+                device=batch.input_ids.device,
+            )
+            for _ in range(self.model.config.n_layer)
+        ]

-    def fast_forward(self, batch: VectorizedCausalLMBatch, max_input_length: int, cache_dtype:Optional[torch.dtype]):
-        diff=max_input_length-batch.max_input_length
-        batch.input_ids[:, batch.max_input_length:max_input_length].fill_(self.tokenizer.pad_token_id)
+    def fast_forward(
+        self,
+        batch: VectorizedCausalLMBatch,
+        max_input_length: int,
+        cache_dtype: Optional[torch.dtype],
+    ):
+        diff = max_input_length - batch.max_input_length
+        batch.input_ids[:, batch.max_input_length : max_input_length].fill_(
+            self.tokenizer.pad_token_id
+        )
        batch.input_lengths = [length + diff for length in batch.input_lengths]
        batch.max_input_length += diff
        for stopping_criteria in batch.stopping_criterias:
-            stopping_criteria.current_tokens+=diff
-        batch.past_key_values = None if cache_dtype is None else self.mock_kv_cache(batch, cache_dtype)
-
-
+            stopping_criteria.current_tokens += diff
+        batch.past_key_values = (
+            None if cache_dtype is None else self.mock_kv_cache(batch, cache_dtype)
+        )