optional rust validation

2025-09-10 11:54:52 +00:00 · 2023-04-04 12:35:29 +02:00 · 2023-04-04 12:35:29 +02:00 · 47e93409f3
commit 47e93409f3
parent 45eacb782d
18 changed files with 453 additions and 270 deletions
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@ -27,7 +27,8 @@ serde = {version = "1.0.142", features = ["derive"]}
 serde_json = "1.0"
 text-generation-client = { path = "../router/client" }
 thiserror = "1.0.38"
-tokenizers = "0.13.2"
+#tokenizers = "0.13.2"
 tokenizers = { git = "https://github.com/huggingface/tokenizers.git" }
 tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
 tracing = "0.1.37"
--- a/proto/generate.proto
+++ b/proto/generate.proto
@ -63,10 +63,12 @@ message Request {
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Context truncation
    uint32 truncate = 3;
    /// Next Token Chooser Parameters
-    NextTokenChooserParameters parameters = 3;
+    NextTokenChooserParameters parameters = 4;
    /// Stopping Criteria Parameters
-    StoppingCriteriaParameters stopping_parameters = 4;
+    StoppingCriteriaParameters stopping_parameters = 5;
 }
 message Batch {
--- a/router/src/main.rs
+++ b/router/src/main.rs
@ -94,11 +94,11 @@ fn main() -> Result<(), std::io::Error> {
        if local_path.exists() && local_path.is_dir() && local_path.join("tokenizer.json").exists()
        {
            // Load local tokenizer
-            Tokenizer::from_file(local_path.join("tokenizer.json")).unwrap()
+            Tokenizer::from_file(local_path.join("tokenizer.json")).ok()
        } else {
            // Download and instantiate tokenizer
            // We need to download it outside of the Tokio runtime
-            Tokenizer::from_pretrained(tokenizer_name.clone(), None).unwrap()
+            Tokenizer::from_pretrained(tokenizer_name.clone(), None).ok()
        };
    // Launch Tokio runtime
@ -109,6 +109,13 @@ fn main() -> Result<(), std::io::Error> {
        .block_on(async {
            init_logging(otlp_endpoint, json_output);
            if tokenizer.is_none() {
                tracing::warn!(
                    "Could not find a fast tokenizer implementation for {tokenizer_name}"
                );
                tracing::warn!("Rust input length validation and truncation is disabled");
            }
            // Get pipeline tag
            let model_info = reqwest::get(format!(
                "https://huggingface.co/api/models/{tokenizer_name}"
--- a/router/src/queue.rs
+++ b/router/src/queue.rs
@ -174,6 +174,7 @@ impl State {
                batch_requests.push(Request {
                    id,
                    inputs: entry.request.inputs.clone(),
                    truncate: entry.request.truncate,
                    parameters: Some(entry.request.parameters.clone()),
                    stopping_parameters: Some(entry.request.stopping_parameters.clone()),
                });
@ -226,6 +227,7 @@ mod tests {
        Entry {
            request: ValidGenerateRequest {
                inputs: "".to_string(),
                truncate: 0,
                parameters: NextTokenChooserParameters {
                    temperature: 0.0,
                    top_k: 0,
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -455,7 +455,7 @@ pub async fn run(
    max_batch_size: usize,
    max_waiting_tokens: usize,
    client: ShardedClient,
-    tokenizer: Tokenizer,
+    tokenizer: Option<Tokenizer>,
    validation_workers: usize,
    addr: SocketAddr,
    allow_origin: Option<AllowOrigin>,
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -23,7 +23,7 @@ pub struct Validation {
 impl Validation {
    pub(crate) fn new(
        workers: usize,
-        tokenizer: Tokenizer,
+        tokenizer: Option<Tokenizer>,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_input_length: usize,
@ -85,7 +85,7 @@ impl Validation {
 /// Load balance the validation requests between multiple validation workers
 async fn validation_task(
    workers: usize,
-    tokenizer: Tokenizer,
+    tokenizer: Option<Tokenizer>,
    max_stop_sequences: usize,
    max_input_length: usize,
    max_total_tokens: usize,
@ -95,7 +95,7 @@ async fn validation_task(
    // Create workers
    for _ in 0..workers {
-        let tokenizer_clone: Tokenizer = tokenizer.clone().into();
+        let tokenizer_clone: Option<Tokenizer> = tokenizer.clone().into();
        // Create channel to communicate with worker
        let (worker_sender, worker_receiver) = mpsc::channel(workers);
        workers_senders.push(worker_sender);
@ -127,7 +127,7 @@ async fn validation_task(
 /// Check the parameters inside the payload and get the number of tokens inside the input using
 /// the tokenizer
 fn validation_worker(
-    tokenizer: Tokenizer,
+    tokenizer: Option<Tokenizer>,
    max_stop_sequences: usize,
    max_input_length: usize,
    max_total_tokens: usize,
@ -143,7 +143,7 @@ fn validation_worker(
                .send(
                    validate(
                        request,
-                        &tokenizer,
+                        tokenizer.as_ref(),
                        max_stop_sequences,
                        max_input_length,
                        max_total_tokens,
@ -162,7 +162,7 @@ fn validation_worker(
 fn validate(
    request: GenerateRequest,
-    tokenizer: &Tokenizer,
+    tokenizer: Option<&Tokenizer>,
    max_stop_sequences: usize,
    max_input_length: usize,
    max_total_tokens: usize,
@ -272,35 +272,43 @@ fn validate(
        })
        .unwrap_or(Ok(None))?;
-    // Get the number of tokens in the input
+    // If we have a fast tokenizer
-    let mut encoding = tokenizer
+    let inputs = if let Some(tokenizer) = tokenizer {
-        .encode(request.inputs.clone(), true)
+        // Get the number of tokens in the input
-        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
+        let mut encoding = tokenizer
-
+            .encode(request.inputs.clone(), true)
    let (inputs, input_length) = if let Some(truncate) = truncate {
        // truncate encoding and decode new inputs
        encoding.truncate(truncate, 0, TruncationDirection::Left);
        let inputs = tokenizer
            .decode(Vec::from(encoding.get_ids()), false)
            .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
-        (inputs, encoding.len())
+
        let (inputs, input_length) = if let Some(truncate) = truncate {
            // truncate encoding and decode new inputs
            encoding.truncate(truncate, 0, TruncationDirection::Left);
            let inputs = tokenizer
                .decode(Vec::from(encoding.get_ids()), false)
                .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
            (inputs, encoding.len())
        } else {
            (request.inputs, encoding.len())
        };
        if input_length > max_input_length {
            return Err(ValidationError::InputLength(max_input_length, input_length));
        }
        let total_tokens = input_length + max_new_tokens as usize;
        if total_tokens > max_total_tokens {
            return Err(ValidationError::MaxTotalTokens(
                max_total_tokens,
                input_length,
                max_new_tokens,
            ));
        }
        metrics::histogram!("tgi_request_input_length", input_length as f64);
        inputs
    } else {
-        (request.inputs, encoding.len())
+        request.inputs
    };
    if input_length > max_input_length {
        return Err(ValidationError::InputLength(max_input_length, input_length));
    }
    let total_tokens = input_length + max_new_tokens as usize;
    if total_tokens > max_total_tokens {
        return Err(ValidationError::MaxTotalTokens(
            max_total_tokens,
            input_length,
            max_new_tokens,
        ));
    }
    // Return ValidGenerateRequest
    let parameters = NextTokenChooserParameters {
        temperature,
@ -318,11 +326,11 @@ fn validate(
        ignore_eos_token: false,
    };
    metrics::histogram!("tgi_request_input_length", input_length as f64);
    metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64);
    Ok(ValidGenerateRequest {
        inputs,
        truncate: truncate.unwrap_or(max_input_length) as u32,
        parameters,
        stopping_parameters,
    })
@ -337,6 +345,7 @@ type ValidationRequest = (
 #[derive(Debug)]
 pub(crate) struct ValidGenerateRequest {
    pub inputs: String,
    pub truncate: u32,
    pub parameters: NextTokenChooserParameters,
    pub stopping_parameters: StoppingCriteriaParameters,
 }
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@ -66,6 +66,7 @@ class CausalLMBatch(Batch):
        stopping_criterias = []
        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        for r in pb.requests:
            inputs.append(r.inputs)
@ -74,6 +75,7 @@ class CausalLMBatch(Batch):
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            max_truncation = max(max_truncation, r.truncate)
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )
@ -83,6 +85,8 @@ class CausalLMBatch(Batch):
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)
        input_lengths = tokenized_inputs["attention_mask"].sum(1)
@ -388,6 +392,7 @@ class CausalLM(Model):
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text = self.decode_token(
                all_input_ids[-2, 0],
                next_token_id_squeezed,
            )
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@ -21,21 +21,18 @@
 import torch
 import torch.distributed
 from torch.nn import functional as F
 from torch import nn
 from transformers.activations import ACT2FN
 from text_generation_server.models.custom_modeling.tensor_parallel import (
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
 )
 from text_generation_server.models.custom_modeling.linear import FastLinear
 from text_generation_server.models.custom_modeling.rotary import PositionRotaryEmbedding
 # Flash attention imports
 import rotary_emb
 import flash_attn_cuda
 import dropout_layer_norm
 from flash_attn.layers.rotary import RotaryEmbedding
 class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
@ -87,6 +84,184 @@ class LlamaRMSNorm(nn.Module):
            return normed_hidden_states, res
 class FastLinear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super(FastLinear, self).__init__(in_features, out_features, bias, device, dtype)
    def transpose_weight(self):
        self.weight = nn.Parameter(self.weight.T)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if self.bias is not None:
            return torch.addmm(self.bias, input, self.weight)
        return torch.matmul(input, self.weight)
 class TensorParallelColumnLinear(FastLinear):
    def __init__(
        self,
        in_features,
        out_features,
        process_group: torch.distributed.ProcessGroup,
        bias=True,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_world_size = process_group.size()
        assert out_features % self.tp_world_size == 0
        out_features = out_features // self.tp_world_size
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            device=device,
            dtype=dtype,
        )
 class TensorParallelRowLinear(FastLinear):
    def __init__(
        self,
        in_features,
        out_features,
        process_group: torch.distributed.ProcessGroup,
        reduce=True,
        bias=True,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_world_size = process_group.size()
        self.reduce = reduce
        assert in_features % self.tp_world_size == 0
        in_features = in_features // self.tp_world_size
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            device=device,
            dtype=dtype,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        out = super(TensorParallelRowLinear, self).forward(input)
        if self.reduce:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class TensorParallelEmbedding(nn.Embedding):
    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        process_group: torch.distributed.ProcessGroup,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.original_num_embeddings = num_embeddings
        assert num_embeddings % self.tp_world_size == 0
        block_size = num_embeddings // self.tp_world_size
        # inputs in `[min_id, max_id[` are handled by `self` to get embeddings
        self.min_id = self.tp_rank * block_size
        self.max_id = (self.tp_rank + 1) * block_size
        # Additional entry that will map to zero
        # Used for masking
        self.null_idx = block_size
        super().__init__(
            block_size,
            embedding_dim,
            padding_idx=padding_idx,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            sparse=sparse,
            _weight=_weight,
            device=device,
            dtype=dtype,
        )
    def add_null_idx(self):
        """Additional 0 entry used for masking"""
        self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1)))
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = super().forward(input)
        torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class PositionRotaryEmbedding(RotaryEmbedding):
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
        """
        Return cos and sin for the asked position ids
        """
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)
        return cos.unsqueeze(1), sin.unsqueeze(1)
    def forward(self, qkv: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
        rotary_dim = cos.shape[-1]
        q1 = qkv[:, 0, :, :rotary_dim]
        q2 = qkv[:, 0, :, rotary_dim : 2 * rotary_dim]
        k1 = qkv[:, 1, :, :rotary_dim]
        k2 = qkv[:, 1, :, rotary_dim : 2 * rotary_dim]
        rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
        rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        return qkv
 class FlashLlamaAttention(torch.nn.Module):
    def __init__(
        self,
--- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
@ -21,23 +21,20 @@
 import torch
 import torch.distributed
 from torch.nn import functional as F
 from torch import nn
 from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.gpt_neox import GPTNeoXConfig
 from text_generation_server.models.custom_modeling.tensor_parallel import (
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
 )
 from text_generation_server.models.custom_modeling.linear import FastLinear
 from text_generation_server.models.custom_modeling.rotary import PositionRotaryEmbedding
 # Flash attention imports
 import rotary_emb
 import flash_attn_cuda
 import dropout_layer_norm
 from flash_attn.layers.rotary import RotaryEmbedding
 class FastLayerNorm(nn.LayerNorm):
    def forward(self, hidden_states, residual=None):
@ -75,6 +72,184 @@ class FastLayerNorm(nn.LayerNorm):
            return normed_hidden_states, residual
 class FastLinear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super(FastLinear, self).__init__(in_features, out_features, bias, device, dtype)
    def transpose_weight(self):
        self.weight = nn.Parameter(self.weight.T)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if self.bias is not None:
            return torch.addmm(self.bias, input, self.weight)
        return torch.matmul(input, self.weight)
 class TensorParallelColumnLinear(FastLinear):
    def __init__(
        self,
        in_features,
        out_features,
        process_group: torch.distributed.ProcessGroup,
        bias=True,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_world_size = process_group.size()
        assert out_features % self.tp_world_size == 0
        out_features = out_features // self.tp_world_size
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            device=device,
            dtype=dtype,
        )
 class TensorParallelRowLinear(FastLinear):
    def __init__(
        self,
        in_features,
        out_features,
        process_group: torch.distributed.ProcessGroup,
        reduce=True,
        bias=True,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_world_size = process_group.size()
        self.reduce = reduce
        assert in_features % self.tp_world_size == 0
        in_features = in_features // self.tp_world_size
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            device=device,
            dtype=dtype,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        out = super(TensorParallelRowLinear, self).forward(input)
        if self.reduce:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class TensorParallelEmbedding(nn.Embedding):
    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        process_group: torch.distributed.ProcessGroup,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.original_num_embeddings = num_embeddings
        assert num_embeddings % self.tp_world_size == 0
        block_size = num_embeddings // self.tp_world_size
        # inputs in `[min_id, max_id[` are handled by `self` to get embeddings
        self.min_id = self.tp_rank * block_size
        self.max_id = (self.tp_rank + 1) * block_size
        # Additional entry that will map to zero
        # Used for masking
        self.null_idx = block_size
        super().__init__(
            block_size,
            embedding_dim,
            padding_idx=padding_idx,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            sparse=sparse,
            _weight=_weight,
            device=device,
            dtype=dtype,
        )
    def add_null_idx(self):
        """Additional 0 entry used for masking"""
        self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1)))
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = super().forward(input)
        torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class PositionRotaryEmbedding(RotaryEmbedding):
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
        """
        Return cos and sin for the asked position ids
        """
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)
        return cos.unsqueeze(1), sin.unsqueeze(1)
    def forward(self, qkv: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
        rotary_dim = cos.shape[-1]
        q1 = qkv[:, 0, :, :rotary_dim]
        q2 = qkv[:, 0, :, rotary_dim : 2 * rotary_dim]
        k1 = qkv[:, 1, :, :rotary_dim]
        k2 = qkv[:, 1, :, rotary_dim : 2 * rotary_dim]
        rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
        rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        return qkv
 class FlashNeoxAttention(torch.nn.Module):
    def __init__(
        self,
@ -201,12 +376,7 @@ class FlashMLP(nn.Module):
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
-            else lambda x: torch.nn.functional.gelu(
+            else lambda x: torch.nn.functional.gelu(x, approximate="tanh")
                x,
                approximate="tanh"
                if act in ["gelu_fast", "gelu_pytorch_tanh"]
                else None,
            )
        )
        if process_group is None:
--- a/server/text_generation_server/models/custom_modeling/linear.py
+++ b/server/text_generation_server/models/custom_modeling/linear.py
@ -1,22 +0,0 @@
 import torch
 from torch import nn
 class FastLinear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super(FastLinear, self).__init__(in_features, out_features, bias, device, dtype)
    def transpose_weight(self):
        self.weight = nn.Parameter(self.weight.T)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if self.bias is not None:
            return torch.addmm(self.bias, input, self.weight)
        return torch.matmul(input, self.weight)
--- a/server/text_generation_server/models/custom_modeling/rotary.py
+++ b/server/text_generation_server/models/custom_modeling/rotary.py
@ -1,42 +0,0 @@
 import torch
 import rotary_emb
 from flash_attn.layers.rotary import RotaryEmbedding
 class PositionRotaryEmbedding(RotaryEmbedding):
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
        """
        Return cos and sin for the asked position ids
        """
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)
        return cos.unsqueeze(1), sin.unsqueeze(1)
    def forward(self, qkv: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
        rotary_dim = cos.shape[-1]
        q1 = qkv[:, 0, :, :rotary_dim]
        q2 = qkv[:, 0, :, rotary_dim : 2 * rotary_dim]
        k1 = qkv[:, 1, :, :rotary_dim]
        k2 = qkv[:, 1, :, rotary_dim : 2 * rotary_dim]
        rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
        rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        return qkv
--- a/server/text_generation_server/models/custom_modeling/tensor_parallel.py
+++ b/server/text_generation_server/models/custom_modeling/tensor_parallel.py
@ -1,124 +0,0 @@
 import torch
 import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 from text_generation_server.models.custom_modeling.linear import FastLinear
 class TensorParallelColumnLinear(FastLinear):
    def __init__(
        self,
        in_features,
        out_features,
        process_group: torch.distributed.ProcessGroup,
        bias=True,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_world_size = process_group.size()
        assert out_features % self.tp_world_size == 0
        out_features = out_features // self.tp_world_size
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            device=device,
            dtype=dtype,
        )
 class TensorParallelRowLinear(FastLinear):
    def __init__(
        self,
        in_features,
        out_features,
        process_group: torch.distributed.ProcessGroup,
        reduce=True,
        bias=True,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_world_size = process_group.size()
        self.reduce = reduce
        assert in_features % self.tp_world_size == 0
        in_features = in_features // self.tp_world_size
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            device=device,
            dtype=dtype,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        out = super(TensorParallelRowLinear, self).forward(input)
        if self.reduce:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class TensorParallelEmbedding(nn.Embedding):
    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        process_group: torch.distributed.ProcessGroup,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        device=None,
        dtype=None,
    ):
        self.process_group = process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.original_num_embeddings = num_embeddings
        assert num_embeddings % self.tp_world_size == 0
        block_size = num_embeddings // self.tp_world_size
        # inputs in `[min_id, max_id[` are handled by `self` to get embeddings
        self.min_id = self.tp_rank * block_size
        self.max_id = (self.tp_rank + 1) * block_size
        # Additional entry that will map to zero
        # Used for masking
        self.null_idx = block_size
        super().__init__(
            block_size,
            embedding_dim,
            padding_idx=padding_idx,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            sparse=sparse,
            _weight=_weight,
            device=device,
            dtype=dtype,
        )
    def add_null_idx(self):
        """Additional 0 entry used for masking"""
        self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1)))
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = super().forward(input)
        torch.distributed.all_reduce(out, group=self.process_group)
        return out
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@ -78,7 +78,9 @@ class FlashCausalLMBatch(Batch):
        # Parse batch
        for r in pb.requests:
-            tokenized_input = tokenizer(r.inputs)["input_ids"]
+            tokenized_input = tokenizer(
                r.inputs, truncation=True, max_length=r.truncate
            )["input_ids"]
            input_length = len(tokenized_input)
            max_seqlen = max(max_seqlen, input_length)
            input_lengths.append(input_length)
@ -333,6 +335,7 @@ class FlashCausalLM(Model):
            # Generated token
            next_token_logprob = logprobs[-1, next_token_id_item]
            next_token_text = self.decode_token(
                all_input_ids[-2],
                next_token_id_item,
            )
--- a/server/text_generation_server/models/flash_llama.py
+++ b/server/text_generation_server/models/flash_llama.py
@ -11,8 +11,6 @@ from typing import Optional, Tuple, List
 from text_generation_server.models import FlashCausalLM
 from text_generation_server.models.custom_modeling.flash_llama_modeling import (
    FlashLlamaForCausalLM,
 )
 from text_generation_server.models.custom_modeling.tensor_parallel import (
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
--- a/server/text_generation_server/models/flash_neox.py
+++ b/server/text_generation_server/models/flash_neox.py
@ -8,14 +8,12 @@ from transformers import AutoTokenizer, AutoConfig
 from typing import Optional, Tuple, List
 from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.tensor_parallel import (
+from text_generation_server.models.custom_modeling.flash_neox_modeling import (
    FlashGPTNeoXForCausalLM,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
 )
 from text_generation_server.models.custom_modeling.flash_neox_modeling import (
    FlashGPTNeoXForCausalLM,
 )
 from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@ -96,6 +96,7 @@ class GalacticaCausalLMBatch(CausalLMBatch):
        input_lengths = []
        # Parse batch
        max_truncation = 0
        max_sequence_length = 0
        padding_right_offset = 0
        for r in pb.requests:
@ -107,6 +108,7 @@ class GalacticaCausalLMBatch(CausalLMBatch):
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            max_truncation = max(max_truncation, r.truncate)
            max_sequence_length = max(max_sequence_length, r.input_length)
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
@ -118,6 +120,8 @@ class GalacticaCausalLMBatch(CausalLMBatch):
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)
        input_ids = tokenized_inputs["input_ids"]
        # Allocate maximum attention_mask
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@ -15,15 +15,6 @@ class Model(ABC):
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.device = device
        # see `decode_token` method
        self.tokenizer.add_special_tokens(
            {"additional_special_tokens": ["<decode-token>"]}
        )
        self.special_decode_token_id = self.tokenizer.convert_tokens_to_ids(
            "<decode-token>"
        )
        self.special_decode_token_length = len("<decode-token>")
    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
@ -33,11 +24,12 @@ class Model(ABC):
    def generate_token(self, batch: B) -> Tuple[List[GeneratedText], Optional[B]]:
        raise NotImplementedError
-    def decode_token(self, token_id: int) -> str:
+    def decode_token(self, previous_token_id: int, token_id: int) -> str:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
-        # append token to special decode token and decode both
+        # Decode previous token and previous token + token
-        result = self.tokenizer.decode(
+        results = self.tokenizer.batch_decode(
-            [self.special_decode_token_id, token_id], skip_special_tokens=False
+            [[previous_token_id], [previous_token_id, token_id]],
            skip_special_tokens=False,
        )
-        # slice to remove special decode token
+        # slice to remove previous token
-        return result[self.special_decode_token_length :]
+        return results[1][len(results[0]) :]
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@ -73,6 +73,7 @@ class Seq2SeqLMBatch(Batch):
        decoder_input_lengths = []
        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        for r in pb.requests:
            inputs.append(r.inputs)
@ -84,6 +85,7 @@ class Seq2SeqLMBatch(Batch):
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            max_truncation = max(max_truncation, r.truncate)
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )
@ -94,6 +96,8 @@ class Seq2SeqLMBatch(Batch):
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)
        input_lengths = tokenized_inputs["attention_mask"].sum(1)
@ -463,6 +467,7 @@ class Seq2SeqLM(Model):
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text = self.decode_token(
                decoder_input_ids[-2],
                next_token_id_squeezed,
            )