From f092ba9b223a0582fa930981ad319d253d1ac765 Mon Sep 17 00:00:00 2001 From: "Ehsan M. Kermani" <6980212+ehsanmok@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:16:35 -0700 Subject: [PATCH 01/39] feat(server): add watermarking tests (#248) --- .github/workflows/client-tests.yaml | 2 +- .github/workflows/tests.yaml | 2 +- Makefile | 9 +++- README.md | 17 +++--- clients/python/pyproject.toml | 5 +- clients/python/tests/test_client.py | 13 +++++ router/client/src/sharded_client.rs | 2 +- router/src/health.rs | 8 ++- server/tests/utils/test_watermark.py | 54 +++++++++++++++++++ .../text_generation_server/models/__init__.py | 2 - 10 files changed, 98 insertions(+), 16 deletions(-) create mode 100644 server/tests/utils/test_watermark.py diff --git a/.github/workflows/client-tests.yaml b/.github/workflows/client-tests.yaml index 7ccef3b0..1fa0b39d 100644 --- a/.github/workflows/client-tests.yaml +++ b/.github/workflows/client-tests.yaml @@ -22,4 +22,4 @@ jobs: - name: Run tests run: | pip install pytest pytest-asyncio - cd clients/python && pytest tests + make python-client-tests diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 26cbf3b2..d9858a3b 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -66,7 +66,7 @@ jobs: - name: Run server tests run: | pip install pytest - HF_HUB_ENABLE_HF_TRANSFER=1 pytest -sv server/tests + make python-server-tests - name: Run Rust fmt run: | cargo fmt --check diff --git a/Makefile b/Makefile index 21fd11b5..032a49de 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,13 @@ router-dev: integration-tests: install-router install-launcher cargo test -python-tests: - cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests +python-server-tests: + HF_HUB_ENABLE_HF_TRANSFER=1 pytest server/tests + +python-client-tests: + pytest clients/python/tests + +python-tests: python-server-tests python-client-tests run-bloom-560m: text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080 diff --git a/README.md b/README.md index 0c63f36b..351a83aa 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ to power LLMs api-inference widgets. - [Quantization](#quantization) - [Develop](#develop) - [Testing](#testing) - + ## Features - Serve the most popular Large Language Models with a simple launcher @@ -131,7 +131,7 @@ by setting the address to an OTLP collector with the `--otlp-endpoint` argument. ### A note on Shared Memory (shm) -[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by +[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by `PyTorch` to do distributed training/inference. `text-generation-inference` make use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models. @@ -152,14 +152,14 @@ creating a volume with: and mounting it to `/dev/shm`. -Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that +Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that this will impact performance. ### Local install -You can also opt to install `text-generation-inference` locally. +You can also opt to install `text-generation-inference` locally. 
-First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least +First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least Python 3.9, e.g. using `conda`: ```shell @@ -181,7 +181,7 @@ sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*' rm -f $PROTOC_ZIP ``` -On MacOS, using Homebrew: +On MacOS, using Homebrew: ```shell brew install protobuf @@ -241,6 +241,11 @@ make router-dev ## Testing ```shell +# python +make python-server-tests +make python-client-tests +# or both server and client tests make python-tests +# rust cargo tests make integration-tests ``` diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index d883af37..00bb99fc 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -21,6 +21,9 @@ pytest = "^6.2.5" pytest-asyncio = "^0.17.2" pytest-cov = "^3.0.0" +[tool.pytest.ini_options] +asyncio_mode = "auto" + [build-system] -requires = ["poetry-core"] +requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py index d53c2a4d..8972dfd1 100644 --- a/clients/python/tests/test_client.py +++ b/clients/python/tests/test_client.py @@ -87,6 +87,19 @@ async def test_generate_async(flan_t5_xxl_url, hf_headers): ) +@pytest.mark.asyncio +async def test_generate_async_best_of(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) + response = await client.generate( + "test", max_new_tokens=1, best_of=2, do_sample=True + ) + + assert response.details.seed is not None + assert response.details.best_of_sequences is not None + assert len(response.details.best_of_sequences) == 1 + assert response.details.best_of_sequences[0].seed is not None + + @pytest.mark.asyncio async def test_generate_async_not_found(fake_url, hf_headers): client = AsyncClient(fake_url, hf_headers) diff --git a/router/client/src/sharded_client.rs b/router/client/src/sharded_client.rs index 2f57a437..73524740 100644 --- a/router/client/src/sharded_client.rs +++ b/router/client/src/sharded_client.rs @@ -20,7 +20,7 @@ impl ShardedClient { /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method. async fn from_master_client(mut master_client: Client) -> Result { // Get all uris/unix sockets from the master client - let uris = master_client.service_discovery().await.unwrap(); + let uris = master_client.service_discovery().await?; let futures = uris.into_iter().map(Client::connect_uds); let clients: Result> = join_all(futures).await.into_iter().collect(); Ok(Self::new(clients?)) diff --git a/router/src/health.rs b/router/src/health.rs index 02edf328..45f50e9d 100644 --- a/router/src/health.rs +++ b/router/src/health.rs @@ -4,6 +4,10 @@ use text_generation_client::{ Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters, }; +// Note: Request ids and batch ids cannot collide. 
+const LIVENESS_ID: u64 = u64::MAX; +const BATCH_ID: u64 = u64::MAX; + #[derive(Clone, Debug)] pub(crate) struct Health { client: ShardedClient, @@ -27,7 +31,7 @@ impl Health { // Dummy batch of 1 token and 1 generated token let liveness_request = Request { - id: u64::MAX, + id: LIVENESS_ID, inputs: "liveness".to_string(), truncate: 10, parameters: Some(NextTokenChooserParameters { @@ -47,7 +51,7 @@ impl Health { }), }; let batch = Batch { - id: u64::MAX, + id: BATCH_ID, requests: vec![liveness_request], size: 1, max_tokens: 2, diff --git a/server/tests/utils/test_watermark.py b/server/tests/utils/test_watermark.py new file mode 100644 index 00000000..5d0aaf05 --- /dev/null +++ b/server/tests/utils/test_watermark.py @@ -0,0 +1,54 @@ +# test_watermark_logits_processor.py + +import os +import numpy as np +import torch +from text_generation_server.utils.watermark import WatermarkLogitsProcessor + + +GAMMA = os.getenv("WATERMARK_GAMMA", 0.5) +DELTA = os.getenv("WATERMARK_DELTA", 2.0) + + +def test_seed_rng(): + input_ids = [101, 2036, 3731, 102, 2003, 103] + processor = WatermarkLogitsProcessor() + processor._seed_rng(input_ids) + assert isinstance(processor.rng, torch.Generator) + + +def test_get_greenlist_ids(): + input_ids = [101, 2036, 3731, 102, 2003, 103] + processor = WatermarkLogitsProcessor() + result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu")) + assert max(result) <= 10 + assert len(result) == int(10 * 0.5) + + +def test_calc_greenlist_mask(): + processor = WatermarkLogitsProcessor() + scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) + greenlist_token_ids = torch.tensor([2, 3]) + result = processor._calc_greenlist_mask(scores, greenlist_token_ids) + assert result.tolist() == [[False, False, False, False], [False, False, True, True]] + assert result.shape == scores.shape + + +def test_bias_greenlist_logits(): + processor = WatermarkLogitsProcessor() + scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) + green_tokens_mask = torch.tensor( + [[False, False, True, True], [False, False, False, True]] + ) + greenlist_bias = 2.0 + result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias) + assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]]) + assert result.shape == scores.shape + + +def test_call(): + input_ids = [101, 2036, 3731, 102, 2003, 103] + processor = WatermarkLogitsProcessor() + scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) + result = processor(input_ids, scores) + assert result.shape == scores.shape diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 74a7483e..221c9139 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -58,8 +58,6 @@ __all__ = [ "GalacticaSharded", "GPTNeoxSharded", "Seq2SeqLM", - "Galactica", - "GalacticaSharded", "SantaCoder", "OPT", "OPTSharded", From 593a5634142e52b537f9bfe1bbf131c7ade1185f Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Thu, 27 Apr 2023 19:18:33 +0200 Subject: [PATCH 02/39] feat(docker): add nvidia env vars (#255) --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index c17a9414..c65d3dee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -164,6 +164,11 @@ FROM base as sagemaker COPY sagemaker-entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh +# NVIDIA env vars +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES 
compute,utility +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 + ENTRYPOINT ["./entrypoint.sh"] # Final image From b0b97fd9a7d549421eadc1ab574f060a8e114b9c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Sat, 29 Apr 2023 11:53:42 +0200 Subject: [PATCH 03/39] doc(launcher): add more docs to the `launcher` itself and link in the README (#257) --- README.md | 5 ++ launcher/src/main.rs | 130 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/README.md b/README.md index 351a83aa..712aea6a 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,11 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf ``` **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. +To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli: +``` +text-generation-launcher --help +``` + You can then query the model using either the `/generate` or `/generate_stream` routes: ```shell diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 7b5f908a..77b0cf8a 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -18,52 +18,182 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection}; #[derive(Parser, Debug)] #[clap(author, version, about, long_about = None)] struct Args { + /// The name of the model to load. + /// Can be a MODEL_ID as listed on like + /// `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. + /// Or it can be a local directory containing the necessary files + /// as saved by `save_pretrained(...)` methods of transformers #[clap(default_value = "bigscience/bloom-560m", long, env)] model_id: String, + + /// The actual revision of the model if you're referring to a model + /// on the hub. You can use a specific commit id or a branch like `refs/pr/2`. #[clap(long, env)] revision: Option, + + /// Wether to shard or not the model across multiple GPUs + /// By default text-generation-inference will use all available GPUs to run + /// the model. Setting it to `false` deactivates `num_shard`. #[clap(long, env)] sharded: Option, + + /// The number of shards to use if you don't want to use all GPUs on a given machine. + /// You can use `CUDA_VISIBLE_DEVICE=0,1 text-generation-launcher... --num_shard 2` + /// and `CUDA_VISIBLE_DEVICE=2,3 text-generation-launcher... --num_shard 2` to + /// launch 2 copies with 2 shard each on a given machine with 4 GPUs for instance. #[clap(long, env)] num_shard: Option, + + /// Wether you want the model to be quantized or not. This will use bitsandbytes for + /// quantization on the fly. #[clap(long, env)] quantize: bool, + + /// The maximum amount of concurrent requests for this particular deployment. + /// Having a low limit will refuse clients requests instead of having them + /// wait for too long and is usually good to handle backpressure correctly. #[clap(default_value = "128", long, env)] max_concurrent_requests: usize, + + /// This is the maximum allowed value for clients to set `best_of`. + /// Best of makes `n` generations at the same time, and return the best + /// in terms of overall log probability over the entire generated sequence #[clap(default_value = "2", long, env)] max_best_of: usize, + + /// This is the maximum allowed value for clients to set `stop_sequences`. 
+ /// Stop sequences are used to allow the model to stop on more than just + /// the EOS token, and enable more complex "prompting" where users can preprompt + /// the model in a specific way and define their "own" stop token aligned with + /// their prompt. #[clap(default_value = "4", long, env)] max_stop_sequences: usize, + + /// This is the maximum allowed input length (expressed in number of tokens) + /// for users. The larger this value, the longer prompt users can send which + /// can impact the overall memory required to handle the load. + /// Please note that some models have a finite range of sequence they can handle. #[clap(default_value = "1000", long, env)] max_input_length: usize, + + /// This is the most important value to set as it defines the "memory budget" + /// of running clients requests. + /// Clients will send input sequences and ask to generate `max_new_tokens` + /// on top. with a value of `1512` users can send either a prompt of + /// `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for + /// `1511` max_new_tokens. + /// The larger this value, the larger amount each request will be in your RAM + /// and the less effective batching can be. #[clap(default_value = "1512", long, env)] max_total_tokens: usize, + + /// The maximum allowed batch size during dynamic batching. + /// Using `max_batch_total_tokens` should be favored in general + /// as it's a finer way to control RAM usage. #[clap(long, env)] max_batch_size: Option, + + /// This represents the ratio of waiting queries vs running queries where + /// you want to start considering pausing the running queries to include the waiting + /// ones into the same batch. + /// `waiting_served_ratio=1.2` Means when 12 queries are waiting and there's + /// only 10 queries left in the current batch we check if we can fit those 12 + /// waiting queries into the batching strategy, and if yes, then batching happens + /// delaying the 10 running queries by a `prefill` run. + /// + /// This setting is only applied if there is room in the batch + /// as defined by `max_batch_total_tokens`. #[clap(default_value = "1.2", long, env)] waiting_served_ratio: f32, + + /// **IMPORTANT** This is one critical control to allow maximum usage + /// of the available hardware. + /// + /// This represents the total amount of potential tokens within a batch. + /// When using padding (not recommended) this would be equivalent of + /// `batch_size` * `max_total_tokens`. + /// + /// However in the non-padded (flash attention) version this can be much finer. + /// + /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` + /// or a single query of `1000` tokens. + /// + /// So you don't have to control that finely + /// `max_batch_size` or `max_total_tokens`. In fact you could mostly relax them if you + /// want maximum flexibility. However, for your users if they are asking for the full amount of + /// total tokens, they are likely to wait for a very long time to get a spot + /// in the batch (since they are going to be alone) so setting `max_batch_size` + /// and `max_total_tokens` can still be useful to prevent those long waiting times. + /// + /// Overall this number should be the largest possible amount that fits the + /// remaining memory (after the model is loaded). Since the actual memory overhead + /// depends on other parameters like if you're using quantization, flash attention + /// or the model implementation, text-generation-inference cannot infer this number + /// automatically. 
#[clap(default_value = "32000", long, env)] max_batch_total_tokens: u32, + + /// This setting defines how many tokens can be passed before forcing the waiting + /// queries to be put on the batch (if the size of the batch allows for it). + /// New queries require 1 `prefill` forward, which is different from `decode` + /// and therefore you need to pause the running batch in order to run `prefill` + /// to create the correct values for the waiting queries to be able to join the batch. + /// + /// With a value too small, queries will always "steal" the compute to run `prefill` + /// and running queries will be delayed by a lot. + /// + /// With a value too big, waiting queries could wait for a very long time + /// before being allowed a slot in the running batch. If your server is busy + /// that means that requests that could run in ~2s on an empty server could + /// end up running in ~20s because the query had to wait for 18s. + /// + /// This number is expressed in number of tokens to make it a bit more + /// "model" agnostic, but what should really matter is the overall latency + /// for end users. #[clap(default_value = "20", long, env)] max_waiting_tokens: usize, #[clap(default_value = "3000", long, short, env)] + + /// The port to listen on. port: u16, + + /// The name of the socket for gRPC communication between the webserver + /// and the shards. #[clap(default_value = "/tmp/text-generation-server", long, env)] shard_uds_path: String, + + /// The address the master shard will listen on. (setting used by torch distributed) #[clap(default_value = "localhost", long, env)] master_addr: String, + + /// The address the master port will listen on. (setting used by torch distributed) #[clap(default_value = "29500", long, env)] master_port: usize, + + /// The location of the huggingface hub cache. + /// Used to override the location if you want to provide a mounted disk for instance #[clap(long, env)] huggingface_hub_cache: Option, + + /// The location of the huggingface hub cache. + /// Used to override the location if you want to provide a mounted disk for instance #[clap(long, env)] weights_cache_override: Option, + + /// For some models (like bloom), text-generation-inference implemented custom + /// cuda kernels to speed up inference. Those kernels were only tested on A100. + /// Use this flag to disable them if you're running on different hardware and + /// encounter issues. 
#[clap(long, env)] disable_custom_kernels: bool, + + /// Outputs the logs in JSON format (useful for telemetry) #[clap(long, env)] json_output: bool, + #[clap(long, env)] otlp_endpoint: Option, + #[clap(long, env)] cors_allow_origin: Vec, #[clap(long, env)] From 0e9d249b7948743743d5a96e56d6c4e1a3d011da Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Sat, 29 Apr 2023 12:17:30 +0200 Subject: [PATCH 04/39] feat(benchmark): add support for private tokenizers (#262) --- benchmark/src/main.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 443af77f..03f61dcd 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -5,7 +5,7 @@ use clap::Parser; use std::path::Path; use text_generation_client::ShardedClient; -use tokenizers::Tokenizer; +use tokenizers::{FromPretrainedParameters, Tokenizer}; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; use tracing_subscriber::EnvFilter; @@ -16,6 +16,8 @@ use tracing_subscriber::EnvFilter; struct Args { #[clap(short, long, env)] tokenizer_name: String, + #[clap(default_value = "main", long, env)] + revision: String, #[clap(short, long)] batch_size: Option>, #[clap(default_value = "10", short, long, env)] @@ -36,6 +38,7 @@ fn main() -> Result<(), Box> { // Pattern match configuration let Args { tokenizer_name, + revision, batch_size, sequence_length, decode_length, @@ -59,10 +62,19 @@ fn main() -> Result<(), Box> { tracing::info!("Found local tokenizer"); Tokenizer::from_file(local_path.join("tokenizer.json")).unwrap() } else { + tracing::info!("Downloading tokenizer"); + + // Parse Huggingface hub token + let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); + // Download and instantiate tokenizer // We need to download it outside of the Tokio runtime - tracing::info!("Downloading tokenizer"); - Tokenizer::from_pretrained(tokenizer_name.clone(), None).unwrap() + let params = FromPretrainedParameters { + revision, + auth_token, + ..Default::default() + }; + Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap() }; tracing::info!("Tokenizer loaded"); From e86cca9723d8f7b9a17f11d716240d0ad3e0f875 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 1 May 2023 14:16:50 +0200 Subject: [PATCH 05/39] Adding docs on how dynamic batching works. (#258) This PR starts the minimal possible amount of explanation I could think of. It tries to explain how dynamic batching occurs, the interactions with past key values and ignores the padding problem. Maybe some drawings could help too but I kept it to text for now. --- README.md | 3 +- router/README.md | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 router/README.md diff --git a/README.md b/README.md index 712aea6a..756d7e35 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,8 @@ to power LLMs api-inference widgets. 
- Serve the most popular Large Language Models with a simple launcher - Tensor Parallelism for faster inference on multiple GPUs - Token streaming using Server-Sent Events (SSE) -- [Dynamic batching of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput +- [Continous batching of incoming requests](https://github.com/huggingface/text-generation-inference/tree/main/router) for increased total throughput +- Optimized transformers code for inference using [flash-attention](https://github.com/HazyResearch/flash-attention) on the most popular architectures - Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) - [Safetensors](https://github.com/huggingface/safetensors) weight loading - Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) diff --git a/router/README.md b/router/README.md new file mode 100644 index 00000000..c18d4f9e --- /dev/null +++ b/router/README.md @@ -0,0 +1,93 @@ +# Router + +Also named `webserver` throughout the docs. + +This router is handling most of the logic to handle the "batches" tell +when to pass new `prefill` requests and pausing `decode` requests, which ones etc... + +It uses gRPC to communicate with the shards which can therefore be kept +much simpler and focus on having the most efficient forward passes as possible. + +## Continuous batching + +One important feature of `text-generation-inference` is enabled +by this `router`. + +Continuous batching is the act of regularly running queries in the same +`forward` step of the LLM (a "batch") and also removing them when they are +finished. + +In order for continuous batching to be useful, you need to have more compute available +with respect to the memory requirements of your model. This is essentially true for +LLMs and the larger the model, the truer it gets (since you have to pool multiple +GPUs to load the model, you effectively have a lot of compute power at your hands). + + +Static batching is the act of doing several queries at the same time, but usually +this is controlled by the client, and therefore the amount of batching is decided +beforehand. + +For text-generation, and LLMs which are memory bound we can try to be much more +efficient with the available compute, by having client sending us single queries, +and let the router mix&match queries into or out of batches to make the use the +compute the most efficiently. This is possible because for LLMs the total compute +for running the model is much bigger than doing mix&match of the batches themselves. + + +### Simple continuous batching + +text-generation works by feeding a prompt to a model, and iteratively calling +`forward` on the model to produce new text, 1 token at a time. + +The first idea is simple, when a query arrives, we start working on it directly. +When new queries arrive, we simply wait for the current `forward` to be finished +then batch the current running prompt with the new query, and call `forward`. + +Whenever either query is finished: either the model produce EOS (end of sentence) token +or the query reached the allowed limit. We simply drop it from the batch, remove +all the allocated memory and we can continue with the rest until nothing is left. + +This simple idea generalizes very well and we could potentially stack many requests +in the same batch. 
+
+One thing to note is that queries can potentially be run with different parameters,
+meaning different ways to choose the next token (sampling, not sampling, temperature, top_k, etc.). This is not problematic for the proposed approach; we just need to do the sampling
+independently on each member of the batch.
+
+### Prefill, decode and past key values
+
+In order to make LLMs and text-generation efficient, there's actually a very powerful
+trick that can be used, which is the "caching" of some attention matrices. [More on that
+in the first part of this blog](https://huggingface.co/blog/accelerated-inference#getting-to-the-first-10x-speedup)
+
+What this means is that the first "pass" over a prompt is different from the subsequent
+"forward" passes: for the first one we have to compute the entire attention matrix, whereas the follow-ups only require computing the attention for the new token.
+The first pass is called `prefill` throughout this codebase, whereas the follow-ups are called `decode`.
+
+Since `prefill` is much more expensive than `decode` we don't want to do it all the time,
+but a currently running query is probably doing `decode`. If we want to do the continuous
+batching as explained previously, we need to run `prefill` at some point in order to create
+the attention matrix required to be able to join the `decode` group.
+
+`text-generation-inference` uses a bunch of different strategies and parameters in
+order to enable you to find the sweet spot between exploiting the hardware and perceived latency.
+
+With no continuous batching at all, latency is going to be super good, but throughput (meaning
+the total number of requests allowed in a given timeframe) is going to be super bad (since it's essentially 1).
+
+With static batching, you can probably reach the maximum throughput (by using the maximum total batch size applicable to your hardware), but the latency is super bad since in order to have maximum throughput you need to wait for requests to come in before processing.
+
+With continuous batching you can find a sweet spot. In general, latency is the most critical
+parameter users care about. But a 2x latency slowdown for 10x more users on the same
+hardware is an acceptable tradeoff.
+
+## Token streaming
+
+This is a very important aspect of client UX. As mentioned above, latency is the
+most critical perceived quality of an LLM API.
+
+With token streaming, the server can start answering after the first `prefill` pass
+directly, without waiting for all the generation to be done. For extremely long queries
+this means clients can start to see something happening orders of magnitude before
+the work is done. Seeing something in progress allows them to cut it short if it's not
+what's wanted, but it also "feels" better.
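The README added in this patch describes continuous batching in prose; a minimal sketch of that loop, assuming hypothetical `Request`, `prefill`, `decode` and `serve` helpers, might look like the following. It is not the router implementation (which is written in Rust and drives the shards over gRPC); it only illustrates how waiting queries get a `prefill` pass, join the running batch, and are dropped as soon as they finish.

```python
# Illustrative sketch only: names and helpers below are hypothetical, not the real router API.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Request:
    prompt: str
    max_new_tokens: int
    generated: List[str] = field(default_factory=list)

    @property
    def finished(self) -> bool:
        # A request ends when it hits its token budget or emits an EOS token.
        return len(self.generated) >= self.max_new_tokens or (
            len(self.generated) > 0 and self.generated[-1] == "<eos>"
        )


def prefill(requests: List[Request]) -> None:
    # First forward pass: builds the attention (KV) cache and emits one token per request.
    for r in requests:
        r.generated.append(f"token_{len(r.generated)}")


def decode(requests: List[Request]) -> None:
    # Follow-up forward pass: one new token per running request, reusing the cache.
    for r in requests:
        r.generated.append(f"token_{len(r.generated)}")


def serve(waiting: List[Request]) -> None:
    running: List[Request] = []
    while waiting or running:
        if waiting:
            # Pause decoding, prefill the newcomers, then merge them into the running batch.
            newcomers, waiting = waiting, []
            prefill(newcomers)
            running.extend(newcomers)
        if running:
            decode(running)
        # Drop finished requests immediately so their slots (and memory) can be reused.
        running = [r for r in running if not r.finished]


if __name__ == "__main__":
    serve([Request("Hello", max_new_tokens=3), Request("World", max_new_tokens=5)])
```

In the real router, settings such as `waiting_served_ratio` and `max_batch_total_tokens` (documented in the launcher patch above) decide when waiting queries are merged into the running batch; the sketch merges them unconditionally for brevity.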
From 411b0d4e1ffab5e8d20147b593058c068dcc17b5 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 2 May 2023 15:43:19 +0200 Subject: [PATCH 06/39] chore(github): add templates (#264) --- .github/ISSUE_TEMPLATE/bug-report.yml | 67 +++++++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 2 + .github/ISSUE_TEMPLATE/feature-request.yml | 31 +++++++++ .github/ISSUE_TEMPLATE/new-model-addition.yml | 31 +++++++++ .github/PULL_REQUEST_TEMPLATE.md | 40 +++++++++++ .github/workflows/build.yaml | 1 + Cargo.lock | 41 ++++++++++++ Dockerfile | 1 + launcher/Cargo.toml | 3 + launcher/build.rs | 29 ++++++++ launcher/src/env_runtime.rs | 45 +++++++++++++ launcher/src/main.rs | 11 +++ router/build.rs | 7 ++ router/src/lib.rs | 2 + router/src/server.rs | 1 + 15 files changed, 312 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug-report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature-request.yml create mode 100644 .github/ISSUE_TEMPLATE/new-model-addition.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 launcher/build.rs create mode 100644 launcher/src/env_runtime.rs diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 00000000..12c93b9e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,67 @@ +name: "\U0001F41B Bug Report" +description: Submit a bug report to help us improve text-generation-inference +body: + - type: textarea + id: system-info + attributes: + label: System Info + description: | + Please share your system info with us (`text-generation-launcher --env` if installed locally). + The full command line used that causes issues: + OS version: + Rust version (if self-compiling, `cargo version`): + Model being used (`curl 127.0.0.1:8080/info | jq`): + If local model please explicit the kind of model and/or equivalents. + Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`): + Deployment specificities (Kubernetes, EKS, AKS, any particular deployments): + The current version being used: + + placeholder: text-generation-inference version, platform, python version, ... + validations: + required: true + + - type: checkboxes + id: information-scripts-examples + attributes: + label: Information + description: 'The problem arises when using:' + options: + - label: "Docker" + - label: "The CLI directly" + + - type: checkboxes + id: information-tasks + attributes: + label: Tasks + description: "The thing I am working on is:" + options: + - label: "An officially supported command" + - label: "My own modifications" + + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + If you have code snippets, error messages, stack traces please provide them here as well. + Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting + Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. + + placeholder: | + Steps to reproduce the behavior: + + 1. + 2. + 3. + + + - type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "A clear and concise description of what you would expect to happen." 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..e6477729 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,2 @@ +blank_issues_enabled: true +version: 2.1 diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml new file mode 100644 index 00000000..5abc1565 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -0,0 +1,31 @@ +name: "\U0001F680 Feature request" +description: Submit a proposal/request for a new text-generation-inference feature +labels: [ "feature" ] +body: + - type: textarea + id: feature-request + validations: + required: true + attributes: + label: Feature request + description: | + A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. + + - type: textarea + id: motivation + validations: + required: true + attributes: + label: Motivation + description: | + Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. + + + - type: textarea + id: contribution + validations: + required: true + attributes: + label: Your contribution + description: | + Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md) diff --git a/.github/ISSUE_TEMPLATE/new-model-addition.yml b/.github/ISSUE_TEMPLATE/new-model-addition.yml new file mode 100644 index 00000000..2f3476d3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new-model-addition.yml @@ -0,0 +1,31 @@ +name: "\U0001F31F New model addition" +description: Submit a proposal/request to implement a new model +labels: [ "New model" ] + +body: + - type: textarea + id: description-request + validations: + required: true + attributes: + label: Model description + description: | + Put any and all important information relative to the model + + - type: checkboxes + id: information-tasks + attributes: + label: Open source status + description: | + Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `transformers`. + options: + - label: "The model implementation is available" + - label: "The model weights are available" + + - type: textarea + id: additional-info + attributes: + label: Provide useful links for the implementation + description: | + Please provide information regarding the implementation, the weights, and the authors. + Please mention the authors by @gh-username if you're aware of their usernames. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..ad5b98ab --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,40 @@ +# What does this PR do? + + + + + +Fixes # (issue) + + +## Before submitting +- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). +- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), + Pull Request section? +- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link + to it if that's the case. +- [ ] Did you make sure to update the documentation with your changes? 
Here are the + [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and + [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). +- [ ] Did you write any new necessary tests? + + +## Who can review? + +Anyone in the community is free to review the PR once the tests have passed. Feel free to tag +members/contributors who may be interested in your PR. + + diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 67d1c730..289e4f67 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -97,6 +97,7 @@ jobs: platforms: 'linux/amd64' build-args: | GIT_SHA=${{ env.GITHUB_SHA }} + DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max diff --git a/Cargo.lock b/Cargo.lock index e240fe9f..58142d4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1428,6 +1428,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -2063,6 +2072,15 @@ dependencies = [ "walkdir", ] +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.37.11" @@ -2136,6 +2154,12 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" + [[package]] name = "serde" version = "1.0.160" @@ -2345,6 +2369,20 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sysinfo" +version = "0.28.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c2f3ca6693feb29a89724516f016488e9aafc7f37264f898593ee4b942f31b" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "winapi", +] + [[package]] name = "tar" version = "0.4.38" @@ -2399,6 +2437,7 @@ dependencies = [ "subprocess", "tracing", "tracing-subscriber", + "vergen", ] [[package]] @@ -2985,7 +3024,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1b86a8af1dedf089b1c78338678e4c7492b6045649042d94faf19690499d236" dependencies = [ "anyhow", + "rustc_version", "rustversion", + "sysinfo", "time", ] diff --git a/Dockerfile b/Dockerfile index c65d3dee..d59acc10 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,7 @@ RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder ARG GIT_SHA +ARG DOCKER_LABEL RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index 5944a360..885dcd98 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -17,3 +17,6 @@ tracing-subscriber = { version = "0.3.16", features = ["json"] } 
[dev-dependencies] float_eq = "1.0.1" reqwest = { version = "0.11.14", features = ["blocking", "json"] } + +[build-dependencies] +vergen = { version = "8.0.0", features = ["build", "cargo", "git", "gitcl", "rustc", "si"] } diff --git a/launcher/build.rs b/launcher/build.rs new file mode 100644 index 00000000..71d2c0c5 --- /dev/null +++ b/launcher/build.rs @@ -0,0 +1,29 @@ +use std::error::Error; +use vergen::EmitBuilder; + +fn main() -> Result<(), Box> { + // Emit cargo and rustc compile time values + EmitBuilder::builder().all_cargo().all_rustc().emit()?; + + // Try to get the git sha from the local git repository + if EmitBuilder::builder() + .fail_on_error() + .git_sha(false) + .emit() + .is_err() + { + // Unable to get the git sha + if let Ok(sha) = std::env::var("GIT_SHA") { + // Set it from an env var + println!("cargo:rustc-env=VERGEN_GIT_SHA={sha}"); + } + } + + // Set docker label if present + if let Ok(label) = std::env::var("DOCKER_LABEL") { + // Set it from an env var + println!("cargo:rustc-env=DOCKER_LABEL={label}"); + } + + Ok(()) +} diff --git a/launcher/src/env_runtime.rs b/launcher/src/env_runtime.rs new file mode 100644 index 00000000..9dbc83f7 --- /dev/null +++ b/launcher/src/env_runtime.rs @@ -0,0 +1,45 @@ +use std::fmt; +use std::process::Command; + +pub(crate) struct Env { + cargo_target: &'static str, + cargo_version: &'static str, + git_sha: &'static str, + docker_label: &'static str, + nvidia_env: String, +} + +impl Env { + pub fn new() -> Self { + let nvidia_env = nvidia_smi(); + + Self { + nvidia_env: nvidia_env.unwrap_or("N/A".to_string()), + cargo_target: env!("VERGEN_CARGO_TARGET_TRIPLE"), + cargo_version: env!("VERGEN_RUSTC_SEMVER"), + git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"), + docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"), + } + } +} + +impl fmt::Display for Env { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "Runtime environment:")?; + + writeln!(f, "Target: {}", self.cargo_target)?; + writeln!(f, "Cargo version: {}", self.cargo_version)?; + writeln!(f, "Commit sha: {}", self.git_sha)?; + writeln!(f, "Docker label: {}", self.docker_label)?; + write!(f, "nvidia-smi:\n{}", self.nvidia_env)?; + + Ok(()) + } +} + +fn nvidia_smi() -> Option { + let output = Command::new("nvidia-smi").output().ok()?; + let nvidia_smi = String::from_utf8(output.stdout).ok()?; + let output = nvidia_smi.replace('\n', "\n "); + Some(output.trim().to_string()) +} diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 77b0cf8a..736aa5a7 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -14,6 +14,8 @@ use std::time::{Duration, Instant}; use std::{fs, io}; use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection}; +mod env_runtime; + /// App Configuration #[derive(Parser, Debug)] #[clap(author, version, about, long_about = None)] @@ -200,6 +202,10 @@ struct Args { watermark_gamma: Option, #[clap(long, env)] watermark_delta: Option, + + /// Display a lot of information about your runtime environment + #[clap(long, short, action)] + env: bool, } #[derive(Debug)] @@ -829,6 +835,11 @@ fn main() -> Result<(), LauncherError> { tracing_subscriber::fmt().compact().init(); } + if args.env { + let env_runtime = env_runtime::Env::new(); + tracing::info!("{}", env_runtime); + } + tracing::info!("{:?}", args); let num_shard = find_num_shards(args.sharded, args.num_shard); diff --git a/router/build.rs b/router/build.rs index 1b1fdc86..f5eb8a26 100644 --- a/router/build.rs +++ b/router/build.rs @@ -15,5 +15,12 
@@ fn main() -> Result<(), Box> { println!("cargo:rustc-env=VERGEN_GIT_SHA={sha}"); } } + + // Set docker label if present + if let Ok(label) = std::env::var("DOCKER_LABEL") { + // Set it from an env var + println!("cargo:rustc-env=DOCKER_LABEL={label}"); + } + Ok(()) } diff --git a/router/src/lib.rs b/router/src/lib.rs index c2ff669b..080dc4f4 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -57,6 +57,8 @@ pub struct Info { pub version: &'static str, #[schema(nullable = true, example = "null")] pub sha: Option<&'static str>, + #[schema(nullable = true, example = "null")] + pub docker_label: Option<&'static str>, } #[derive(Clone, Debug, Deserialize, ToSchema)] diff --git a/router/src/server.rs b/router/src/server.rs index f25fa2b3..c5fc41ef 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -635,6 +635,7 @@ pub async fn run( validation_workers, version: env!("CARGO_PKG_VERSION"), sha: option_env!("VERGEN_GIT_SHA"), + docker_label: option_env!("DOCKER_LABEL"), }; // Create router From 4096000e34ded704355c55fb2b3e1b0b3cb6b3ba Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 3 May 2023 10:10:34 +0200 Subject: [PATCH 07/39] fix(server): fix typo in tokenizers decode (#269) closes #268 --- server/text_generation_server/models/causal_lm.py | 2 +- server/text_generation_server/models/flash_causal_lm.py | 2 +- server/text_generation_server/models/flash_santacoder.py | 2 +- server/text_generation_server/models/galactica.py | 2 +- server/text_generation_server/models/santacoder.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 7dc7fb85..26a9a661 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -490,7 +490,7 @@ class CausalLM(Model): def decode(self, generated_ids: List[int]) -> str: return self.tokenizer.decode( - generated_ids, skip_special_tokens=True, cleanup_tokenization_spaces=False + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False ) def forward( diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 61ccca84..413866d1 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -402,7 +402,7 @@ class FlashCausalLM(Model): def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str: return self.tokenizer.decode( - generated_ids, skip_special_tokens=True, cleanup_tokenization_spaces=False + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False ) def forward( diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index aa1bdfb5..550be956 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -165,7 +165,7 @@ class FlashSantacoder(FlashCausalLM): def decode(self, generated_ids: List[int]) -> str: # Do not skip special tokens as they are used for custom parsing rules of the generated text return self.tokenizer.decode( - generated_ids, skip_special_tokens=False, cleanup_tokenization_spaces=False + generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False ) diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index 
c1ec1566..78e9bfe4 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -173,7 +173,7 @@ class Galactica(OPT): def decode(self, generated_ids: List[int]) -> str: # Do not skip special tokens as they are used for custom parsing rules of the generated text return self.tokenizer.decode( - generated_ids, skip_special_tokens=False, cleanup_tokenization_spaces=False + generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False ) def forward( diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index 796c33e3..a7b09a82 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -64,5 +64,5 @@ class SantaCoder(CausalLM): def decode(self, generated_ids: List[int]) -> str: # Do not skip special tokens as they are used for custom parsing rules of the generated text return self.tokenizer.decode( - generated_ids, skip_special_tokens=False, cleanup_tokenization_spaces=False + generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False ) From 85aa7e2e7b02608eea04206b6cc0fa0ccced80ef Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 3 May 2023 11:36:24 +0200 Subject: [PATCH 08/39] feat(server): support hf endpoint weight layout (#266) --- launcher/src/main.rs | 24 +++++--- server/text_generation_server/cli.py | 55 ++++++++++++++----- server/text_generation_server/models/bloom.py | 9 +++ .../models/flash_llama.py | 19 +++++++ .../models/flash_neox.py | 10 ++++ .../models/flash_santacoder.py | 10 ++++ .../models/galactica.py | 9 +++ .../text_generation_server/models/gpt_neox.py | 9 +++ server/text_generation_server/models/opt.py | 9 +++ server/text_generation_server/models/t5.py | 9 +++ server/text_generation_server/utils/hub.py | 9 ++- 11 files changed, 146 insertions(+), 26 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 736aa5a7..41211d8a 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -512,7 +512,11 @@ enum LauncherError { WebserverCannotStart, } -fn download_model(args: &Args, running: Arc) -> Result<(), LauncherError> { +fn download_convert_model( + args: &Args, + auto_convert: bool, + running: Arc, +) -> Result<(), LauncherError> { let mut download_argv = vec![ "text-generation-server".to_string(), "download-weights".to_string(), @@ -524,6 +528,11 @@ fn download_model(args: &Args, running: Arc) -> Result<(), LauncherE "--json-output".to_string(), ]; + // Auto convert weights to safetensors + if auto_convert { + download_argv.push("--auto-convert".to_string()); + } + // Model optional revision if let Some(revision) = &args.revision { download_argv.push("--revision".to_string()); @@ -855,14 +864,11 @@ fn main() -> Result<(), LauncherError> { }) .expect("Error setting Ctrl-C handler"); - // Check if model_id is a local model - let local_path = Path::new(&args.model_id); - let is_local_model = local_path.exists() && local_path.is_dir(); - - // Download weights for sharded models - if !is_local_model && args.weights_cache_override.is_none() && num_shard > 1 { - download_model(&args, running.clone())?; - } + // auto_convert is only needed for sharded models as we do not require safetensors in + // single shard mode + let auto_convert = num_shard > 1; + // Download and convert model weights + download_convert_model(&args, auto_convert, running.clone())?; // Shared shutdown bool let shutdown = Arc::new(Mutex::new(false)); diff 
--git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 94340fac..92482a94 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -63,6 +63,7 @@ def download_weights( model_id: str, revision: Optional[str] = None, extension: str = ".safetensors", + auto_convert: bool = True, logger_level: str = "INFO", json_output: bool = False, ): @@ -84,31 +85,55 @@ def download_weights( # Test if files were already download try: utils.weight_files(model_id, revision, extension) - logger.info( - "Files are already present in the local cache. " "Skipping download." - ) + logger.info("Files are already present on the host. " "Skipping download.") return # Local files not found - except utils.LocalEntryNotFoundError: + except (utils.LocalEntryNotFoundError, FileNotFoundError): pass - # Download weights directly + is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv( + "WEIGHTS_CACHE_OVERRIDE", None + ) is not None + + if not is_local_model: + # Try to download weights from the hub + try: + filenames = utils.weight_hub_files(model_id, revision, extension) + utils.download_weights(filenames, model_id, revision) + # Successfully downloaded weights + return + + # No weights found on the hub with this extension + except utils.EntryNotFoundError as e: + # Check if we want to automatically convert to safetensors or if we can use .bin weights instead + if not extension == ".safetensors" or not auto_convert: + raise e + + # Try to see if there are local pytorch weights try: - filenames = utils.weight_hub_files(model_id, revision, extension) - utils.download_weights(filenames, model_id, revision) - except utils.EntryNotFoundError as e: - if not extension == ".safetensors": - raise e + # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE + local_pt_files = utils.weight_files(model_id, revision, ".bin") - logger.warning( - f"No safetensors weights found for model {model_id} at revision {revision}. " - f"Converting PyTorch weights instead." - ) + # No local pytorch weights + except utils.LocalEntryNotFoundError: + if extension == ".safetensors": + logger.warning( + f"No safetensors weights found for model {model_id} at revision {revision}. " + f"Downloading PyTorch weights." + ) - # Try to see if there are pytorch weights + # Try to see if there are pytorch weights on the hub pt_filenames = utils.weight_hub_files(model_id, revision, ".bin") # Download pytorch weights local_pt_files = utils.download_weights(pt_filenames, model_id, revision) + + if auto_convert: + logger.warning( + f"No safetensors weights found for model {model_id} at revision {revision}. " + f"Converting PyTorch weights to safetensors." 
+ ) + + # Safetensors final filenames local_st_files = [ p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" for p in local_pt_files diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index e43a4b79..f528a430 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -223,6 +223,15 @@ class BLOOMSharded(BLOOM): if name == "word_embeddings.weight": model.lm_head._parameters["weight"] = tensor + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index e640113b..105ff519 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -139,6 +139,15 @@ class FlashLlama(FlashCausalLM): del value + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + torch.cuda.empty_cache() model.post_load_weights(quantize) @@ -300,5 +309,15 @@ class FlashLlamaSharded(FlashLlama): else: module._buffers[param_name] = tensor + + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + torch.cuda.empty_cache() model.post_load_weights(quantize) diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index eae584ac..fc769583 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -149,4 +149,14 @@ class FlashNeoXSharded(FlashNeoX): module._parameters[param_name] = tensor else: module._buffers[param_name] = tensor + + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + model.post_load_weights(quantize) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 550be956..333180e8 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -372,5 +372,15 @@ class FlashSantacoderSharded(FlashSantacoder): module._parameters[param_name] = tensor else: module._buffers[param_name] = tensor + + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + torch.cuda.empty_cache() model.post_load_weights(quantize) diff --git 
a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index 78e9bfe4..2577f1b1 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -355,6 +355,15 @@ class GalacticaSharded(Galactica): if name == "model.decoder.embed_tokens.weight": model.lm_head._parameters["weight"] = tensor + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 3b5fe2cc..e73a3c82 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -205,6 +205,15 @@ class GPTNeoxSharded(CausalLM): else: module._buffers[param_name] = tensor + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 1a21186f..50e5271e 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -210,6 +210,15 @@ class OPTSharded(OPT): if name == "model.decoder.embed_tokens.weight": model.lm_head._parameters["weight"] = tensor + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 487a5984..9e8c3c4c 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -211,6 +211,15 @@ class T5Sharded(Seq2SeqLM): else: module._buffers[param_name] = tensor + uninitialized_parameters = [] + for n, p in model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model: {uninitialized_parameters}" + ) + def forward( self, input_ids, diff --git a/server/text_generation_server/utils/hub.py b/server/text_generation_server/utils/hub.py index 4feec8a1..030c8289 100644 --- a/server/text_generation_server/utils/hub.py +++ b/server/text_generation_server/utils/hub.py @@ -77,7 +77,12 @@ def weight_files( """Get the local files""" # Local model if Path(model_id).exists() and Path(model_id).is_dir(): - return list(Path(model_id).glob(f"*{extension}")) + local_files = list(Path(model_id).glob(f"*{extension}")) + if not local_files: + raise FileNotFoundError( + f"No local weights found in {model_id} with extension {extension}" + ) + return local_files try: filenames = 
weight_hub_files(model_id, revision, extension) @@ -98,7 +103,7 @@ def weight_files( for filename in filenames: p = Path(WEIGHTS_CACHE_OVERRIDE) / filename if not p.exists(): - raise LocalEntryNotFoundError( + raise FileNotFoundError( f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}." ) files.append(p) From b67908e0cfee66d07552e1f8af6f36e8d993f97c Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 3 May 2023 23:39:35 +0200 Subject: [PATCH 09/39] fix(launcher): pass weights cache override to the download process (#274) closes #273 --- launcher/src/main.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 41211d8a..9f8d215b 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -542,7 +542,7 @@ fn download_convert_model( // Copy current process env let mut env: Vec<(OsString, OsString)> = env::vars_os().collect(); - // If huggingface_hub_cache is set, pass it to the shard + // If huggingface_hub_cache is set, pass it to the download process // Useful when running inside a docker container if let Some(ref huggingface_hub_cache) = args.huggingface_hub_cache { env.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into())); @@ -560,6 +560,15 @@ fn download_convert_model( env.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) }; + // If args.weights_cache_override is some, pass it to the download process + // Useful when running inside a HuggingFace Inference Endpoint + if let Some(weights_cache_override) = &args.weights_cache_override { + env.push(( + "WEIGHTS_CACHE_OVERRIDE".into(), + weights_cache_override.into(), + )); + }; + // Start process tracing::info!("Starting download process."); let mut download_process = match Popen::create( From b4fe248b17ef041b72245b40c86a2157cf053a4b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 4 May 2023 15:14:28 +0200 Subject: [PATCH 10/39] fix(launcher): handle hub branches (#278) --- router/src/main.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/router/src/main.rs b/router/src/main.rs index 28db6071..50932cd1 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -258,9 +258,10 @@ fn init_logging(otlp_endpoint: Option, json_output: bool) { /// get model info from the Huggingface Hub pub async fn get_model_info(model_id: &str, revision: &str, token: Option) -> HubModelInfo { let client = reqwest::Client::new(); - let mut builder = client.get(format!( - "https://huggingface.co/api/models/{model_id}/revision/{revision}" - )); + // Poor man's urlencode + let revision = revision.replace("/", "%2F"); + let url = format!("https://huggingface.co/api/models/{model_id}/revision/{revision}"); + let mut builder = client.get(url); if let Some(token) = token { builder = builder.bearer_auth(token); } From f08343d44d3da0b5fae80f5c283ac1b7bc6697e5 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 4 May 2023 15:22:54 +0200 Subject: [PATCH 11/39] fix(server): Removes the parallelism in file convertion (during download) (#275) --- .../text_generation_server/utils/convert.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/server/text_generation_server/utils/convert.py b/server/text_generation_server/utils/convert.py index 437e2308..8eddd76f 100644 --- a/server/text_generation_server/utils/convert.py +++ b/server/text_generation_server/utils/convert.py @@ -1,5 +1,6 @@ import concurrent import time +import datetime import torch from concurrent.futures import ThreadPoolExecutor @@ 
-78,17 +79,9 @@ def convert_file(pt_file: Path, st_file: Path): def convert_files(pt_files: List[Path], st_files: List[Path]): assert len(pt_files) == len(st_files) - executor = ThreadPoolExecutor(max_workers=5) - futures = [ - executor.submit(convert_file, pt_file=pt_file, st_file=st_file) - for pt_file, st_file in zip(pt_files, st_files) - ] - + N = len(pt_files) # We do this instead of using tqdm because we want to parse the logs with the launcher - start_time = time.time() - for i, future in enumerate(concurrent.futures.as_completed(futures)): - elapsed = timedelta(seconds=int(time.time() - start_time)) - remaining = len(futures) - (i + 1) - eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0 - - logger.info(f"Convert: [{i + 1}/{len(futures)}] -- ETA: {eta}") + start = datetime.datetime.now() + for i, (pt_file, sf_file) in enumerate(zip(pt_files, st_files)): + elapsed = datetime.datetime.now() - start + logger.info(f"Convert: [{i + 1}/{N}] -- Took: {elapsed}") From e68509add7ac04f8d6b4b75fab6fa65f47c2a76c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 4 May 2023 15:29:29 +0200 Subject: [PATCH 12/39] feat(launcher): Improve error message when download process fails. (#276) --- launcher/src/main.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 9f8d215b..867bfd3d 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -627,8 +627,21 @@ fn download_convert_model( return Err(LauncherError::DownloadError); } } - _ => { - tracing::error!("Download process exited with an unknown status."); + ExitStatus::Signaled(signal) => { + let mut err = String::new(); + download_process + .stderr + .take() + .unwrap() + .read_to_string(&mut err) + .unwrap(); + tracing::error!( + "Download process was signaled to shutdown with signal {signal}: {err}" + ); + return Err(LauncherError::DownloadError); + } + e => { + tracing::error!("Download process exited with an unknown status.: {e:?}"); return Err(LauncherError::DownloadError); } } From 690fc31757a8dc10106e7f95fabe1397f21e0cc6 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 5 May 2023 15:28:08 +0200 Subject: [PATCH 13/39] fix(server): fix convert (#284) --- server/text_generation_server/utils/convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/text_generation_server/utils/convert.py b/server/text_generation_server/utils/convert.py index 8eddd76f..1bdedf58 100644 --- a/server/text_generation_server/utils/convert.py +++ b/server/text_generation_server/utils/convert.py @@ -81,7 +81,9 @@ def convert_files(pt_files: List[Path], st_files: List[Path]): N = len(pt_files) # We do this instead of using tqdm because we want to parse the logs with the launcher - start = datetime.datetime.now() + for i, (pt_file, sf_file) in enumerate(zip(pt_files, st_files)): + start = datetime.datetime.now() + convert_file(pt_file, sf_file) elapsed = datetime.datetime.now() - start logger.info(f"Convert: [{i + 1}/{N}] -- Took: {elapsed}") From 3314a46d363e9bfcf14928dc83c677dd478ff42b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 5 May 2023 17:52:09 +0200 Subject: [PATCH 14/39] chore: add `flash-attention` to docker ignore (#287) included when building docker locally. (Where the local dirs might have the flash-attention folder.) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- .dockerignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index fcfaad0d..c69283ec 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,4 @@ aml target -server/transformers \ No newline at end of file +server/transformers +server/flash-attention From b4aa87db58ea3d87ff820b6d7abeaa20a987df00 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 5 May 2023 17:57:02 +0200 Subject: [PATCH 15/39] fea(server): decrease convert RAM requirements (#286) --- .../text_generation_server/utils/convert.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/server/text_generation_server/utils/convert.py b/server/text_generation_server/utils/convert.py index 1bdedf58..caf1a764 100644 --- a/server/text_generation_server/utils/convert.py +++ b/server/text_generation_server/utils/convert.py @@ -9,6 +9,7 @@ from datetime import timedelta from loguru import logger from pathlib import Path from safetensors.torch import load_file, save_file +from safetensors import safe_open from typing import Dict, List @@ -46,11 +47,11 @@ def remove_shared_pointers(tensors: Dict[str, torch.Tensor]): tensors.pop(name) -def convert_file(pt_file: Path, st_file: Path): +def convert_file(pt_file: Path, sf_file: Path): """ Convert a pytorch file to a safetensors file """ - logger.info(f"Convert {pt_file} to {st_file}.") + logger.info(f"Convert {pt_file} to {sf_file}.") pt_state = torch.load(pt_file, map_location="cpu") if "state_dict" in pt_state: @@ -61,28 +62,28 @@ def convert_file(pt_file: Path, st_file: Path): # Tensors need to be contiguous pt_state = {k: v.contiguous() for k, v in pt_state.items()} - st_file.parent.mkdir(parents=True, exist_ok=True) - save_file(pt_state, str(st_file), metadata={"format": "pt"}) + sf_file.parent.mkdir(parents=True, exist_ok=True) + save_file(pt_state, str(sf_file), metadata={"format": "pt"}) # Check that both files are close in size - check_file_size(pt_file, st_file) + check_file_size(pt_file, sf_file) # Load safetensors state - st_state = load_file(str(st_file)) - for k in st_state: + for k in pt_state: pt_tensor = pt_state[k] - st_tensor = st_state[k] - if not torch.equal(pt_tensor, st_tensor): - raise RuntimeError(f"The output tensors do not match for key {k}") + with safe_open(sf_file, framework="pt") as f: + sf_tensor = f.get_tensor(k) + if not torch.equal(pt_tensor, sf_tensor): + raise RuntimeError(f"The output tensors do not match for key {k}") -def convert_files(pt_files: List[Path], st_files: List[Path]): - assert len(pt_files) == len(st_files) +def convert_files(pt_files: List[Path], sf_files: List[Path]): + assert len(pt_files) == len(sf_files) N = 
len(pt_files) # We do this instead of using tqdm because we want to parse the logs with the launcher - - for i, (pt_file, sf_file) in enumerate(zip(pt_files, st_files)): + + for i, (pt_file, sf_file) in enumerate(zip(pt_files, sf_files)): start = datetime.datetime.now() convert_file(pt_file, sf_file) elapsed = datetime.datetime.now() - start From e9b01b34333a9e2e4e0c54e440616a91a2d971f4 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Tue, 9 May 2023 12:36:02 +0200 Subject: [PATCH 16/39] fix(dockerfile): fix nvidia env vars (#297) Fixes #291 --- Dockerfile | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index d59acc10..8810f27e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -120,6 +120,16 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ NUM_SHARD=1 \ PORT=80 +# NVIDIA env vars +ENV NVIDIA_REQUIRE_CUDA cuda>=11.8 +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +# Required for nvidia-docker v1 +RUN /bin/bash -c echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH + LABEL com.nvidia.volumes.needed="nvidia_driver" WORKDIR /usr/src @@ -165,11 +175,6 @@ FROM base as sagemaker COPY sagemaker-entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh -# NVIDIA env vars -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - ENTRYPOINT ["./entrypoint.sh"] # Final image From 926fd9a0105a2351055c4c526851ed86d3334105 Mon Sep 17 00:00:00 2001 From: Sai Vinay G Date: Tue, 9 May 2023 16:08:09 +0530 Subject: [PATCH 17/39] feat(router): Adding response schema for compat_generate (#292) --- router/src/server.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/router/src/server.rs b/router/src/server.rs index c5fc41ef..a9baf288 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -37,7 +37,11 @@ use utoipa_swagger_ui::SwaggerUi; path = "/", request_body = CompatGenerateRequest, responses( - (status = 200, description = "See /generate or /generate_stream"), + (status = 200, description = "See /generate or /generate_stream", + content( + ("application/json" = GenerateResponse), + ("text/event-stream" = StreamResponse), + )), (status = 424, description = "Generation Error", body = ErrorResponse, example = json ! 
({"error": "Request failed during generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, From e2502822138dfddefccde6e6a39bce22720c8cfc Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Tue, 9 May 2023 13:19:31 +0200 Subject: [PATCH 18/39] feat(docker): add benchmarking tool to docker image (#298) --- Cargo.lock | 143 +- Cargo.toml | 10 +- Dockerfile | 2 + benchmark/.gitignore | 1 - benchmark/Cargo.lock | 2884 --------------------------------- benchmark/Cargo.toml | 13 +- benchmark/rust-toolchain.toml | 3 - docs/openapi.json | 89 +- launcher/Cargo.toml | 7 +- router/Cargo.toml | 7 +- router/client/Cargo.toml | 7 +- router/src/main.rs | 34 +- router/src/server.rs | 2 +- 13 files changed, 262 insertions(+), 2940 deletions(-) delete mode 100644 benchmark/.gitignore delete mode 100644 benchmark/Cargo.lock delete mode 100644 benchmark/rust-toolchain.toml diff --git a/Cargo.lock b/Cargo.lock index 58142d4b..48595077 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -134,6 +134,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "average" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843ec791d3f24503bbf72bbd5e49a3ab4dbb4bcd0a8ef6b0c908efa73caa27b1" +dependencies = [ + "easy-cast", + "float-ord", + "num-traits", +] + [[package]] name = "axum" version = "0.6.15" @@ -293,6 +304,12 @@ dependencies = [ "zip", ] +[[package]] +name = "cassowary" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" + [[package]] name = "cc" version = "1.0.79" @@ -461,6 +478,31 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossterm" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13" +dependencies = [ + "bitflags", + "crossterm_winapi", + "libc", + "mio", + "parking_lot", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +dependencies = [ + "winapi", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -591,6 +633,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "easy-cast" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd102ee8c418348759919b83b81cdbdc933ffe29740b903df448b4bafaa348e" +dependencies = [ + "libm", +] + [[package]] name = "either" version = "1.8.1" @@ -679,6 +730,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "float-ord" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce81f49ae8a0482e4c55ea62ebbd7e5a686af544c00b9d090bba3ff9be97b3d" + [[package]] name = "float_eq" version = "1.0.1" @@ -1165,6 +1222,12 @@ version = "0.2.141" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" +[[package]] +name = "libm" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" + [[package]] name = "linux-raw-sys" version = "0.3.1" @@ -1447,6 +1510,16 @@ 
dependencies = [ "winapi", ] +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", + "libm", +] + [[package]] name = "num_cpus" version = "1.15.0" @@ -1903,6 +1976,19 @@ dependencies = [ "getrandom", ] +[[package]] +name = "ratatui" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcc0d032bccba900ee32151ec0265667535c230169f5a011154cdcd984e16829" +dependencies = [ + "bitflags", + "cassowary", + "crossterm", + "unicode-segmentation", + "unicode-width", +] + [[package]] name = "raw-cpuid" version = "10.7.0" @@ -2252,6 +2338,27 @@ dependencies = [ "dirs", ] +[[package]] +name = "signal-hook" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -2407,9 +2514,28 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "text-generation-benchmark" +version = "0.7.0-dev" +dependencies = [ + "average", + "clap", + "crossterm", + "float-ord", + "ratatui", + "serde", + "serde_json", + "text-generation-client", + "thiserror", + "tokenizers", + "tokio", + "tracing", + "tracing-subscriber", +] + [[package]] name = "text-generation-client" -version = "0.6.0" +version = "0.7.0-dev" dependencies = [ "futures", "grpc-metadata", @@ -2421,12 +2547,11 @@ dependencies = [ "tonic-build", "tower", "tracing", - "tracing-error", ] [[package]] name = "text-generation-launcher" -version = "0.6.0" +version = "0.7.0-dev" dependencies = [ "clap", "ctrlc", @@ -2442,7 +2567,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "0.6.0" +version = "0.7.0-dev" dependencies = [ "async-stream", "axum", @@ -2803,16 +2928,6 @@ dependencies = [ "valuable", ] -[[package]] -name = "tracing-error" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" -dependencies = [ - "tracing", - "tracing-subscriber", -] - [[package]] name = "tracing-futures" version = "0.2.5" diff --git a/Cargo.toml b/Cargo.toml index af479c04..d34b10bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,17 @@ [workspace] members = [ + "benchmark", "router", "router/client", "router/grpc-metadata", "launcher" ] -exclude = [ - "benchmark" -] + +[workspace.package] +version = "0.7.0-dev" +edition = "2021" +authors = ["Olivier Dehaene"] +homepage = "https://github.com/huggingface/text-generation-inference" [profile.release] debug = 1 diff --git a/Dockerfile b/Dockerfile index 8810f27e..7e01d42f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -164,6 +164,8 @@ RUN cd server && \ pip install -r requirements.txt && \ pip install ".[bnb, accelerate]" --no-cache-dir +# Install benchmarker +COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router # Install 
launcher diff --git a/benchmark/.gitignore b/benchmark/.gitignore deleted file mode 100644 index 1de56593..00000000 --- a/benchmark/.gitignore +++ /dev/null @@ -1 +0,0 @@ -target \ No newline at end of file diff --git a/benchmark/Cargo.lock b/benchmark/Cargo.lock deleted file mode 100644 index 81c1180e..00000000 --- a/benchmark/Cargo.lock +++ /dev/null @@ -1,2884 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "aes" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", - "opaque-debug", -] - -[[package]] -name = "aho-corasick" -version = "0.7.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" -dependencies = [ - "memchr", -] - -[[package]] -name = "anstream" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "342258dd14006105c2b75ab1bd7543a03bdf0cfc94383303ac212a04939dff6f" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-wincon", - "concolor-override", - "concolor-query", - "is-terminal", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23ea9e81bd02e310c216d080f6223c179012256e5151c41db88d12c88a1684d2" - -[[package]] -name = "anstyle-parse" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7d1bb534e9efed14f3e5f44e7dd1a4f709384023a4165199a4241e18dff0116" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-wincon" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3127af6145b149f3287bb9a0d10ad9c5692dba8c53ad48285e5bec4063834fa" -dependencies = [ - "anstyle", - "windows-sys 0.45.0", -] - -[[package]] -name = "anyhow" -version = "1.0.70" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" - -[[package]] -name = "async-stream" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad445822218ce64be7a341abfb0b1ea43b5c23aa83902542a4542e78309d8e5e" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4655ae1a7b0cdf149156f780c5bf3f1352bc53cbd9e0a361a7ef7b22947e965" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "async-trait" -version = "0.1.68" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.11", -] - -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - -[[package]] -name = "average" -version = "0.13.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "843ec791d3f24503bbf72bbd5e49a3ab4dbb4bcd0a8ef6b0c908efa73caa27b1" -dependencies = [ - "easy-cast", - "float-ord", - "num-traits", -] - -[[package]] -name = "axum" -version = "0.6.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349f8ccfd9221ee7d1f3d4b33e1f8319b3a81ed8f61f2ea40b37b859794b4491" -dependencies = [ - "async-trait", - "axum-core", - "bitflags", - "bytes", - "futures-util", - "http", - "http-body", - "hyper", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "sync_wrapper", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2f958c80c248b34b9a877a643811be8dbca03ca5ba827f2b63baf3a81e5fc4e" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "rustversion", - "tower-layer", - "tower-service", -] - -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - -[[package]] -name = "base64" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" - -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "bumpalo" -version = "3.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" - -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bytes" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" - -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "cached-path" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "097968e38f1319207f057d0f4d76452e4f4f847a5de61c5215379f297fa034f3" -dependencies = [ - "flate2", - "fs2", - "glob", - "indicatif 0.16.2", - "log", - "rand", - "reqwest", - "serde", - "serde_json", - "sha2", - "tar", - 
"tempfile", - "thiserror", - "zip", -] - -[[package]] -name = "cassowary" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" - -[[package]] -name = "cc" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" -dependencies = [ - "jobserver", -] - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "cipher" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7" -dependencies = [ - "generic-array", -] - -[[package]] -name = "clap" -version = "4.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046ae530c528f252094e4a77886ee1374437744b2bff1497aa898bbddbbb29b3" -dependencies = [ - "clap_builder", - "clap_derive", - "once_cell", -] - -[[package]] -name = "clap_builder" -version = "4.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "223163f58c9a40c3b0a43e1c4b50a9ce09f007ea2cb1ec258a687945b4b7929f" -dependencies = [ - "anstream", - "anstyle", - "bitflags", - "clap_lex", - "strsim", -] - -[[package]] -name = "clap_derive" -version = "4.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.11", -] - -[[package]] -name = "clap_lex" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" - -[[package]] -name = "concolor-override" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a855d4a1978dc52fb0536a04d384c2c0c1aa273597f08b77c8c4d3b2eec6037f" - -[[package]] -name = "concolor-query" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d11d52c3d7ca2e6d0040212be9e4dbbcd78b6447f535b6b561f449427944cf" -dependencies = [ - "windows-sys 0.45.0", -] - -[[package]] -name = "console" -version = "0.15.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60" -dependencies = [ - "encode_unicode", - "lazy_static", - "libc", - "unicode-width", - "windows-sys 0.42.0", -] - -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - -[[package]] -name = "core-foundation" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" - -[[package]] -name = "cpufeatures" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf2b3e8478797446514c91ef04bafcb59faba183e621ad488df88983cc14128c" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" -dependencies = [ - "cfg-if", - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" -dependencies = [ - "autocfg", - "cfg-if", - "crossbeam-utils", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossterm" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13" -dependencies = [ - "bitflags", - "crossterm_winapi", - "libc", - "mio", - "parking_lot", - "signal-hook", - "signal-hook-mio", - "winapi", -] - -[[package]] -name = "crossterm_winapi" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" -dependencies = [ - "winapi", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "darling" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn 1.0.109", -] - -[[package]] -name = "darling_macro" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" -dependencies = [ - "darling_core", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "dashmap" -version = "5.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" -dependencies = [ - "cfg-if", - "hashbrown", - "lock_api", - "once_cell", - "parking_lot_core", -] - -[[package]] -name = "derive_builder" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" -dependencies = [ - "derive_builder_macro", -] - -[[package]] -name = "derive_builder_core" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" -dependencies = [ - "darling", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "derive_builder_macro" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" -dependencies = [ - "derive_builder_core", - "syn 1.0.109", -] - -[[package]] -name = "digest" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - -[[package]] -name = "easy-cast" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bd102ee8c418348759919b83b81cdbdc933ffe29740b903df448b4bafaa348e" -dependencies = [ - "libm", -] - -[[package]] -name = "either" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" - -[[package]] -name = "encode_unicode" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" - -[[package]] -name = "encoding_rs" -version = "0.8.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "errno" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d6a0976c999d473fe89ad888d5a284e55366d9dc9038b1ba2aa15128c4afa0" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.45.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - -[[package]] -name = "esaxx-rs" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35" -dependencies = [ - "cc", -] - -[[package]] -name = "fastrand" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - -[[package]] -name = "filetime" -version = "0.2.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a3de6e8d11b22ff9edc6d916f890800597d60f8b2da1caf2955c274638d6412" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.2.16", - "windows-sys 
0.45.0", -] - -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - -[[package]] -name = "flate2" -version = "1.0.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - -[[package]] -name = "float-ord" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce81f49ae8a0482e4c55ea62ebbd7e5a686af544c00b9d090bba3ff9be97b3d" - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - -[[package]] -name = "form_urlencoded" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "futures" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "531ac96c6ff5fd7c62263c5e3c67a603af4fcaee2e1a0ae5565ba3a11e69e549" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "164713a5a0dcc3e7b4b1ed7d3b433cabc18025386f9339346e8daf15963cf7ac" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86d7a0c1aa76363dac491de0ee99faf6941128376f1cf96f07db7603b7de69dd" - -[[package]] -name = "futures-executor" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1997dd9df74cdac935c76252744c1ed5794fac083242ea4fe77ef3ed60ba0f83" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d422fa3cbe3b40dca574ab087abb5bc98258ea57eea3fd6f1fa7162c778b91" - -[[package]] -name = "futures-macro" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eb14ed937631bd8b8b8977f2c198443447a8355b6e3ca599f38c975e5a963b6" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "futures-sink" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ec93083a4aecafb2a80a885c9de1f0ccae9dbd32c2bb54b0c3a65690e0b8d2f2" - -[[package]] -name = "futures-task" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd65540d33b37b16542a0438c12e6aeead10d4ac5d05bd3f805b8f35ab592879" - -[[package]] -name = "futures-util" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ef6b17e481503ec85211fed8f39d1970f128935ca1f814cd32ac4a6842e84ab" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "glob" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" - -[[package]] -name = "grpc-metadata" -version = "0.1.0" -dependencies = [ - "opentelemetry", - "tonic", - "tracing", - "tracing-opentelemetry", -] - -[[package]] -name = "h2" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5be7b54589b581f624f566bf5d8eb2bab1db736c51528720b6bd36b96b55924d" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "hermit-abi" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] - -[[package]] -name = "hermit-abi" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" - -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - -[[package]] -name = "http" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.8.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" - -[[package]] -name = "httpdate" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" - -[[package]] -name = "hyper" -version = "0.14.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc5e554ff619822309ffd57d8734d77cd5ce6238bc956f037ea06c58238c9899" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - "pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "hyper-tls" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" -dependencies = [ - "bytes", - "hyper", - "native-tls", - "tokio", - "tokio-native-tls", -] - -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - -[[package]] -name = "idna" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown", -] - -[[package]] -name = "indicatif" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" -dependencies = [ - "console", - "lazy_static", - "number_prefix 0.3.0", - "regex", -] - -[[package]] -name = "indicatif" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" -dependencies = [ - "console", - "lazy_static", - "number_prefix 0.4.0", - "regex", -] - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" -dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys 0.45.0", -] - -[[package]] -name = "ipnet" -version = "2.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" - -[[package]] -name = "is-terminal" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256017f749ab3117e93acb91063009e1f1bb56d03965b14c2c8df4eb02c524d8" 
-dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys 0.45.0", -] - -[[package]] -name = "itertools" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" - -[[package]] -name = "jobserver" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" -dependencies = [ - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "libc" -version = "0.2.140" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" - -[[package]] -name = "libm" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" - -[[package]] -name = "linux-raw-sys" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd550e73688e6d578f0ac2119e32b797a327631a42f9433e59d02e139c8df60d" - -[[package]] -name = "lock_api" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "macro_rules_attribute" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" -dependencies = [ - "macro_rules_attribute-proc_macro", - "paste", -] - -[[package]] -name = "macro_rules_attribute-proc_macro" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" - -[[package]] -name = "matchers" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" -dependencies = [ - "regex-automata", -] - -[[package]] -name = "matchit" -version = "0.7.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" - -[[package]] -name = "memchr" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" - -[[package]] -name = "memoffset" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" -dependencies = [ - "autocfg", -] - -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "miniz_oxide" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" -dependencies = [ - "adler", -] - -[[package]] -name = "mio" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" -dependencies = [ - "libc", - "log", - "wasi", - "windows-sys 0.45.0", -] - -[[package]] -name = "monostate" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a" -dependencies = [ - "monostate-impl", - "serde", -] - -[[package]] -name = "monostate-impl" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.11", -] - -[[package]] -name = "multimap" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" - -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - -[[package]] -name = "num-traits" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" -dependencies = [ - "autocfg", - "libm", -] - -[[package]] -name = "num_cpus" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" -dependencies = [ - "hermit-abi 0.2.6", - 
"libc", -] - -[[package]] -name = "number_prefix" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" - -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - -[[package]] -name = "once_cell" -version = "1.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" - -[[package]] -name = "onig" -version = "6.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" -dependencies = [ - "bitflags", - "libc", - "once_cell", - "onig_sys", -] - -[[package]] -name = "onig_sys" -version = "69.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" -dependencies = [ - "cc", - "pkg-config", -] - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "openssl" -version = "0.10.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "518915b97df115dd36109bfa429a48b8f737bd05508cf9588977b599648926d2" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "openssl-probe" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" - -[[package]] -name = "openssl-sys" -version = "0.9.83" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "666416d899cf077260dac8698d60a60b435a46d57e82acb1be3d0dad87284e5b" -dependencies = [ - "autocfg", - "cc", - "libc", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "opentelemetry" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" -dependencies = [ - "opentelemetry_api", - "opentelemetry_sdk", -] - -[[package]] -name = "opentelemetry_api" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" -dependencies = [ - "fnv", - "futures-channel", - "futures-util", - "indexmap", - "js-sys", - "once_cell", - "pin-project-lite", - "thiserror", -] - -[[package]] -name = "opentelemetry_sdk" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" -dependencies = [ - "async-trait", - "crossbeam-channel", - "dashmap", - "fnv", - "futures-channel", - "futures-executor", - "futures-util", - "once_cell", - "opentelemetry_api", - "percent-encoding", - "rand", - "thiserror", -] - -[[package]] -name = "overload" -version = "0.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "windows-sys 0.45.0", -] - -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - -[[package]] -name = "paste" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" - -[[package]] -name = "pbkdf2" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] - -[[package]] -name = "percent-encoding" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" - -[[package]] -name = "petgraph" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" -dependencies = [ - "fixedbitset", - "indexmap", -] - -[[package]] -name = "pin-project" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" - -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - -[[package]] -name = "prettyplease" -version = "0.1.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" -dependencies = [ - "proc-macro2", - "syn 1.0.109", -] - -[[package]] -name = "proc-macro2" -version = "1.0.54" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e472a104799c74b514a57226160104aa483546de37e839ec50e3c2e41dd87534" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e48e50df39172a3e7eb17e14642445da64996989bc212b583015435d39a58537" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c828f93f5ca4826f97fedcbd3f9a536c16b12cff3dbbb4a007f932bbad95b12" -dependencies = [ - "bytes", - "heck", - "itertools 0.10.5", - "lazy_static", - "log", - "multimap", - "petgraph", - "prettyplease", - "prost", - "prost-types", - "regex", - "syn 1.0.109", - "tempfile", - "which", -] - -[[package]] -name = "prost-derive" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea9b0f8cbe5e15a8a042d030bd96668db28ecb567ec37d691971ff5731d2b1b" -dependencies = [ - "anyhow", - "itertools 0.10.5", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "prost-types" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379119666929a1afd7a043aa6cf96fa67a6dce9af60c88095a4686dbce4c9c88" -dependencies = [ - "prost", -] - -[[package]] -name = "quote" -version = "1.0.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "ratatui" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcc0d032bccba900ee32151ec0265667535c230169f5a011154cdcd984e16829" -dependencies = [ - "bitflags", - "cassowary", - "crossterm", - "unicode-segmentation", - "unicode-width", -] - -[[package]] -name = "rayon" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-cond" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd1259362c9065e5ea39a789ef40b1e3fd934c94beb7b5ab3ac6629d3b5e7cb7" -dependencies = [ - "either", - "itertools 0.8.2", - "rayon", -] - -[[package]] -name = "rayon-core" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags", -] - -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - -[[package]] -name = "regex" -version = "1.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - -[[package]] -name = "reqwest" -version = "0.11.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" -dependencies = [ - "base64 0.21.0", - "bytes", - "encoding_rs", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-tls", - "ipnet", - "js-sys", - "log", - "mime", - "native-tls", - "once_cell", - "percent-encoding", - "pin-project-lite", - "serde", - "serde_json", - "serde_urlencoded", - "tokio", - "tokio-native-tls", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "winreg", -] - -[[package]] -name = "rustix" -version = "0.37.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e78cc525325c06b4a7ff02db283472f3c042b7ff0c391f96c6d5ac6f4f91b75" -dependencies = [ - "bitflags", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustversion" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" - -[[package]] -name = "ryu" -version = "1.0.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" - -[[package]] -name = "schannel" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" -dependencies = [ - "windows-sys 0.42.0", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "security-framework" -version = "2.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" -dependencies = [ - "bitflags", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] 
-name = "security-framework-sys" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "serde" -version = "1.0.159" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.159" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.11", -] - -[[package]] -name = "serde_json" -version = "1.0.95" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "sha1" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sharded-slab" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "signal-hook" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" -dependencies = [ - "libc", - "signal-hook-registry", -] - -[[package]] -name = "signal-hook-mio" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" -dependencies = [ - "libc", - "mio", - "signal-hook", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" -dependencies = [ - "libc", -] - -[[package]] -name = "slab" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" -dependencies = [ - "autocfg", -] - -[[package]] -name = "smallvec" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" - -[[package]] -name = "socket2" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "spm_precompiled" -version = "0.1.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" -dependencies = [ - "base64 0.13.1", - "nom", - "serde", - "unicode-segmentation", -] - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "subtle" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e3787bb71465627110e7d87ed4faaa36c1f61042ee67badb9e2ef173accc40" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - -[[package]] -name = "tar" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" -dependencies = [ - "filetime", - "libc", - "xattr", -] - -[[package]] -name = "tempfile" -version = "3.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" -dependencies = [ - "cfg-if", - "fastrand", - "redox_syscall 0.3.5", - "rustix", - "windows-sys 0.45.0", -] - -[[package]] -name = "text-generation-benchmark" -version = "0.1.0" -dependencies = [ - "average", - "clap", - "crossterm", - "float-ord", - "ratatui", - "serde", - "serde_json", - "text-generation-client", - "thiserror", - "tokenizers", - "tokio", - "tracing", - "tracing-subscriber", -] - -[[package]] -name = "text-generation-client" -version = "0.6.0" -dependencies = [ - "futures", - "grpc-metadata", - "prost", - "prost-build", - "thiserror", - "tokio", - "tonic", - "tonic-build", - "tower", - "tracing", - "tracing-error", -] - -[[package]] -name = "thiserror" -version = "1.0.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.11", -] - -[[package]] -name = "thread_local" -version = "1.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" -dependencies = [ - "cfg-if", - "once_cell", -] - -[[package]] -name = "time" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" -dependencies = [ - "serde", - "time-core", -] - -[[package]] -name = "time-core" -version = "0.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" - -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "tokenizers" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5" -dependencies = [ - "aho-corasick", - "cached-path", - "clap", - "derive_builder", - "dirs", - "esaxx-rs", - "getrandom", - "indicatif 0.15.0", - "itertools 0.9.0", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand", - "rayon", - "rayon-cond", - "regex", - "regex-syntax", - "reqwest", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", -] - -[[package]] -name = "tokio" -version = "1.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" -dependencies = [ - "autocfg", - "bytes", - "libc", - "mio", - "num_cpus", - "parking_lot", - "pin-project-lite", - "signal-hook-registry", - "socket2", - "tokio-macros", - "windows-sys 0.45.0", -] - -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-macros" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.11", -] - -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - -[[package]] -name = "tokio-stream" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", - "tracing", -] - -[[package]] -name = "tonic" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64 0.13.1", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - "tokio-util", - "tower", - 
"tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic-build" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap", - "pin-project", - "pin-project-lite", - "rand", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" - -[[package]] -name = "tower-service" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" - -[[package]] -name = "tracing" -version = "0.1.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" -dependencies = [ - "cfg-if", - "log", - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "tracing-core" -version = "0.1.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" -dependencies = [ - "once_cell", - "valuable", -] - -[[package]] -name = "tracing-error" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" -dependencies = [ - "tracing", - "tracing-subscriber", -] - -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - -[[package]] -name = "tracing-log" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" -dependencies = [ - "lazy_static", - "log", - "tracing-core", -] - -[[package]] -name = "tracing-opentelemetry" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" -dependencies = [ - "once_cell", - "opentelemetry", - "tracing", - "tracing-core", - "tracing-log", - "tracing-subscriber", -] - -[[package]] -name = "tracing-serde" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" -dependencies = [ - "serde", - "tracing-core", -] - -[[package]] -name = "tracing-subscriber" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" -dependencies = [ - "matchers", - "nu-ansi-term", - "once_cell", - "regex", - "serde", - "serde_json", - "sharded-slab", - "smallvec", - "thread_local", - "tracing", - "tracing-core", - "tracing-log", - "tracing-serde", -] - -[[package]] -name = "try-lock" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" - -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - -[[package]] -name = "unicode-bidi" -version = "0.3.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" - -[[package]] -name = "unicode-ident" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" - -[[package]] -name = "unicode-normalization" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unicode-normalization-alignments" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" -dependencies = [ - "smallvec", -] - -[[package]] -name = "unicode-segmentation" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" - -[[package]] -name = "unicode-width" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" - -[[package]] -name = "unicode_categories" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" - -[[package]] -name = "url" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - -[[package]] -name = "utf8parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" - -[[package]] -name = "valuable" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" - -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "wasi" 
-version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wasm-bindgen" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn 1.0.109", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" -dependencies = [ - "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" - -[[package]] -name = "web-sys" -version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "which" -version = "4.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269" -dependencies = [ - "either", - "libc", - "once_cell", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-sys" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows-sys" 
-version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets", -] - -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - -[[package]] -name = "winreg" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" -dependencies = [ - "winapi", -] - -[[package]] -name = "xattr" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" -dependencies = [ - "libc", -] - -[[package]] -name = "zip" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0445d0fbc924bb93539b4316c11afb121ea39296f99a3c4c9edad09e3658cdef" -dependencies = [ - "aes", - "byteorder", - "bzip2", - "constant_time_eq", - "crc32fast", - "crossbeam-utils", - "flate2", - "hmac", - "pbkdf2", - "sha1", - "time", - "zstd", -] - -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" -dependencies = [ - "cc", - "libc", - "pkg-config", -] diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml index 4a6651c8..a4e215fc 100644 --- a/benchmark/Cargo.toml +++ b/benchmark/Cargo.toml @@ -1,15 +1,10 @@ [package] name = "text-generation-benchmark" -version = "0.1.0" -edition = "2021" -authors = ["Olivier Dehaene"] description = "Text Generation Benchmarking tool" - -[profile.release] -debug = 1 -incremental = true -lto = "off" -panic = "abort" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true [lib] path = "src/lib.rs" diff --git a/benchmark/rust-toolchain.toml b/benchmark/rust-toolchain.toml deleted file mode 100644 index dcdb06e9..00000000 --- a/benchmark/rust-toolchain.toml +++ /dev/null @@ -1,3 +0,0 @@ -[toolchain] -channel = "1.67.0" -components = ["rustfmt", "clippy"] \ No newline at end of file diff --git a/docs/openapi.json b/docs/openapi.json index e7b7b140..fbac7786 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "0.6.0" + "version": "0.7.0-dev" }, "paths": { "/": { @@ -33,7 +33,19 @@ }, "responses": { "200": { - "description": "See /generate or /generate_stream" + "description": "Generated Text", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GenerateResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/StreamResponse" + } + } + } }, "422": { "description": "Input validation error", @@ -584,11 +596,73 @@ "type": "object", "required": [ "model_id", + "model_dtype", + "model_device_type", + "max_concurrent_requests", + "max_best_of", + "max_stop_sequences", + "max_input_length", + "max_total_tokens", + "waiting_served_ratio", + "max_batch_total_tokens", + "max_waiting_tokens", + "validation_workers", "version" ], "properties": { + "docker_label": { + "type": "string", + "example": "null", + "nullable": true + }, + "max_batch_total_tokens": { + "type": "integer", + "format": "int32", + "example": "32000", + "minimum": 0.0 + }, + "max_best_of": { + "type": "integer", + "example": "2", + "minimum": 0.0 + }, + "max_concurrent_requests": { + "type": "integer", + "description": "Router Parameters", + "example": "128", + "minimum": 0.0 + }, + "max_input_length": { + "type": "integer", + "example": "1024", + "minimum": 0.0 + }, + "max_stop_sequences": { + "type": "integer", + "example": "4", + "minimum": 0.0 + }, + "max_total_tokens": { + "type": "integer", + "example": "2048", + "minimum": 0.0 + }, + "max_waiting_tokens": { + "type": "integer", + "example": "20", + "minimum": 0.0 + }, + "model_device_type": { + "type": "string", + "example": "cuda" + }, + "model_dtype": { + "type": "string", + "example": "torch.float16" + }, "model_id": { "type": "string", + "description": "Model info", "example": "bigscience/blomm-560m" }, "model_pipeline_tag": { @@ -606,9 +680,20 @@ "example": "null", "nullable": true }, + "validation_workers": { + "type": "integer", + "example": "2", + "minimum": 0.0 + }, "version": { "type": "string", + "description": "Router Info", "example": "0.5.0" + }, + "waiting_served_ratio": { + "type": "number", + "format": "float", + "example": "1.2" } } }, diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index 885dcd98..6439e960 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "text-generation-launcher" 
-version = "0.6.0" -edition = "2021" -authors = ["Olivier Dehaene"] description = "Text Generation Launcher" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true [dependencies] clap = { version = "4.1.4", features = ["derive", "env"] } diff --git a/router/Cargo.toml b/router/Cargo.toml index 4fa523a5..6503e1bd 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -1,10 +1,11 @@ [package] name = "text-generation-router" -version = "0.6.0" -edition = "2021" -authors = ["Olivier Dehaene"] description = "Text Generation Webserver" build = "build.rs" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true [lib] path = "src/lib.rs" diff --git a/router/client/Cargo.toml b/router/client/Cargo.toml index b32402a5..db7245e0 100644 --- a/router/client/Cargo.toml +++ b/router/client/Cargo.toml @@ -1,7 +1,9 @@ [package] name = "text-generation-client" -version = "0.6.0" -edition = "2021" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true [dependencies] futures = "^0.3" @@ -12,7 +14,6 @@ tokio = { version = "^1.25", features = ["sync"] } tonic = "^0.8" tower = "^0.4" tracing = "^0.1" -tracing-error = "^0.2" [build-dependencies] tonic-build = "0.8.4" diff --git a/router/src/main.rs b/router/src/main.rs index 50932cd1..5ad49003 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -9,6 +9,7 @@ use opentelemetry::{global, KeyValue}; use opentelemetry_otlp::WithExportConfig; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::path::Path; +use std::time::Duration; use text_generation_client::ShardedClient; use text_generation_router::{server, HubModelInfo}; use tokenizers::{FromPretrainedParameters, Tokenizer}; @@ -125,7 +126,7 @@ fn main() -> Result<(), std::io::Error> { .block_on(async { init_logging(otlp_endpoint, json_output); - if let Some(max_batch_size) = max_batch_size{ + if let Some(max_batch_size) = max_batch_size { tracing::warn!("`max-batch-size` is deprecated. 
Use `max-batch-total-tokens` instead"); max_batch_total_tokens = (max_batch_size * max_total_tokens) as u32; tracing::warn!("Overriding `max-batch-total-tokens` value with `max-batch-size` * `max-total-tokens` = {max_batch_total_tokens}"); @@ -145,7 +146,10 @@ fn main() -> Result<(), std::io::Error> { sha: None, pipeline_tag: None, }, - false => get_model_info(&tokenizer_name, &revision, authorization_token).await, + false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or({ + tracing::warn!("Could not retrieve model info from the Hugging Face hub."); + HubModelInfo { model_id: tokenizer_name.to_string(), sha: None, pipeline_tag: None } + }), }; // if pipeline-tag == text-generation we default to return_full_text = true @@ -195,7 +199,7 @@ fn main() -> Result<(), std::io::Error> { addr, cors_allow_origin, ) - .await; + .await; Ok(()) }) } @@ -256,22 +260,24 @@ fn init_logging(otlp_endpoint: Option, json_output: bool) { } /// get model info from the Huggingface Hub -pub async fn get_model_info(model_id: &str, revision: &str, token: Option) -> HubModelInfo { +pub async fn get_model_info( + model_id: &str, + revision: &str, + token: Option, +) -> Option { let client = reqwest::Client::new(); // Poor man's urlencode - let revision = revision.replace("/", "%2F"); + let revision = revision.replace('/', "%2F"); let url = format!("https://huggingface.co/api/models/{model_id}/revision/{revision}"); - let mut builder = client.get(url); + let mut builder = client.get(url).timeout(Duration::from_secs(5)); if let Some(token) = token { builder = builder.bearer_auth(token); } - let model_info = builder - .send() - .await - .expect("Could not connect to hf.co") - .text() - .await - .expect("error when retrieving model info from hf.co"); - serde_json::from_str(&model_info).expect("unable to parse model info") + let response = builder.send().await.ok()?; + + if response.status().is_success() { + return serde_json::from_str(&response.text().await.ok()?).ok(); + } + None } diff --git a/router/src/server.rs b/router/src/server.rs index a9baf288..162b6fd9 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -37,7 +37,7 @@ use utoipa_swagger_ui::SwaggerUi; path = "/", request_body = CompatGenerateRequest, responses( - (status = 200, description = "See /generate or /generate_stream", + (status = 200, description = "Generated Text", content( ("application/json" = GenerateResponse), ("text/event-stream" = StreamResponse), From bc5c07231ec6284b94a8ff78a54d6da4b505a93f Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Tue, 9 May 2023 14:39:59 +0200 Subject: [PATCH 19/39] fix(docker): fix docker build (#299) --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 7e01d42f..d68703cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,7 @@ FROM chef as planner COPY Cargo.toml Cargo.toml COPY rust-toolchain.toml rust-toolchain.toml COPY proto proto +COPY benchmark benchmark COPY router router COPY launcher launcher RUN cargo chef prepare --recipe-path recipe.json @@ -27,6 +28,7 @@ RUN cargo chef cook --release --recipe-path recipe.json COPY Cargo.toml Cargo.toml COPY rust-toolchain.toml rust-toolchain.toml COPY proto proto +COPY benchmark benchmark COPY router router COPY launcher launcher RUN cargo build --release From ad66f6ef9ac3677e6259b85026f911a555970801 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Tue, 9 May 2023 18:26:19 +0200 Subject: [PATCH 20/39] feat(server): optim flash causal lm decode_token (#285) --- 
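A minimal sketch of the batch layout this patch moves to, for review: the hunks below flatten the per-request lists in FlashCausalLMBatch into single tensors (input_ids, position_ids) plus cumulative sequence lengths, and pass cu_seqlens_q into the model forward instead of rebuilding it with torch.arange on every decode step. The helper names used here (build_prefill_metadata, build_decode_cu_seqlens_q) and the demo values are illustrative only and do not exist in the repository; the sketch just shows how the flattened layout and the cumulative offsets relate.

import numpy as np

def build_prefill_metadata(tokenized_inputs):
    """Concatenate per-request token ids and compute cumulative offsets for prefill."""
    input_lengths = [len(t) for t in tokenized_inputs]
    # One flat 1-D array of token ids for the whole batch
    input_ids = np.concatenate(tokenized_inputs)
    # Position ids restart at 0 for every request
    position_ids = np.concatenate([np.arange(0, l) for l in input_lengths])
    # Request i occupies input_ids[cu_seqlens[i]:cu_seqlens[i + 1]]
    cu_seqlens = np.concatenate([[0], np.cumsum(input_lengths)]).astype(np.int32)
    return input_ids, position_ids, cu_seqlens

def build_decode_cu_seqlens_q(batch_size):
    """During decode each request contributes exactly one query token,
    so the query offsets are simply 0..batch_size."""
    return np.arange(0, batch_size + 1, dtype=np.int32)

if __name__ == "__main__":
    batch = [[5, 6, 7], [8, 9], [10, 11, 12, 13]]
    input_ids, position_ids, cu_seqlens = build_prefill_metadata(batch)
    print(input_ids)                              # [ 5  6  7  8  9 10 11 12 13]
    print(position_ids)                           # [0 1 2 0 1 0 1 2 3]
    print(cu_seqlens)                             # [0 3 5 9]
    print(build_decode_cu_seqlens_q(len(batch)))  # [0 1 2 3]

Building these offsets once on the host and moving them to the device in a single copy is what lets the decode path drop the per-step torch.arange/torch.tensor calls removed in the modeling files below.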
.../custom_modeling/flash_llama_modeling.py | 7 +- .../custom_modeling/flash_neox_modeling.py | 7 +- .../flash_santacoder_modeling.py | 7 +- .../models/flash_causal_lm.py | 377 ++++++++++++------ .../models/flash_llama.py | 4 +- .../models/flash_neox.py | 2 +- .../models/flash_santacoder.py | 4 +- 7 files changed, 263 insertions(+), 145 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index de9b22da..1293124a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -554,6 +554,7 @@ class FlashLlamaModel(torch.nn.Module): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values: Optional[torch.Tensor] = None, pre_allocate_past_size: Optional[int] = None, @@ -575,15 +576,11 @@ class FlashLlamaModel(torch.nn.Module): ) ) layer_past_present_indices = None - cu_seqlens_q = None slice_past_index = len(hidden_states) # Decode else: # Create indices from cumulative sequence lengths layer_past_present_indices = cu_seqlens[1:] - 1 - cu_seqlens_q = torch.arange( - cu_seqlens.shape[0], dtype=torch.int32, device=hidden_states.device - ) slice_past_index = None # Get rotary cos and sin for this forward @@ -650,6 +647,7 @@ class FlashLlamaForCausalLM(torch.nn.Module): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values: Optional[torch.Tensor] = None, pre_allocate_past_size: Optional[int] = None, @@ -658,6 +656,7 @@ class FlashLlamaForCausalLM(torch.nn.Module): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values, pre_allocate_past_size, diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index cc9b292f..ae1465ab 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -617,6 +617,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values=None, pre_allocate_past_size: Optional[int] = None, @@ -638,15 +639,11 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): ) ) layer_past_present_indices = None - cu_seqlens_q = None slice_past_index = len(hidden_states) # Decode else: # Create indices from cumulative sequence lengths layer_past_present_indices = cu_seqlens[1:] - 1 - cu_seqlens_q = torch.arange( - cu_seqlens.shape[0], dtype=torch.int32, device=hidden_states.device - ) slice_past_index = None # Get rotary cos and sin for this forward @@ -726,6 +723,7 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values: Optional[torch.Tensor] = None, pre_allocate_past_size: Optional[int] = None, @@ -734,6 +732,7 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values, pre_allocate_past_size, diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 71182f8d..20ad8385 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ 
b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -484,6 +484,7 @@ class FlashSantacoderModel(nn.Module): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values: Optional[torch.Tensor] = None, pre_allocate_past_size: Optional[int] = None, @@ -507,15 +508,11 @@ class FlashSantacoderModel(nn.Module): ) ) layer_past_present_indices = None - cu_seqlens_q = None slice_past_index = len(hidden_states) # Decode else: # Create indices from cumulative sequence lengths layer_past_present_indices = cu_seqlens[1:] - 1 - cu_seqlens_q = torch.arange( - cu_seqlens.shape[0], dtype=torch.int32, device=hidden_states.device - ) slice_past_index = None residual = None @@ -566,6 +563,7 @@ class FlashSantacoderForCausalLM(nn.Module): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values: Optional[torch.Tensor] = None, pre_allocate_past_size: Optional[int] = None, @@ -574,6 +572,7 @@ class FlashSantacoderForCausalLM(nn.Module): input_ids, position_ids, cu_seqlens, + cu_seqlens_q, max_s, past_key_values, pre_allocate_past_size, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 413866d1..b51a3dc6 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1,6 +1,8 @@ import torch import torch.distributed +import numpy as np + from torch.nn import functional as F from dataclasses import dataclass @@ -33,12 +35,16 @@ class FlashCausalLMBatch(Batch): requests_idx_mapping: Dict[int, int] # Decoder values - input_ids: List[torch.Tensor] - position_ids: List[torch.Tensor] + input_ids: torch.Tensor + position_ids: torch.Tensor + # cumulative sequence lengths - cu_seqlens: List[int] + cu_seqlens: torch.Tensor + # cumulative query sequence lengths, only used in decode + cu_seqlens_q: Optional[torch.Tensor] + # past key values, only used in decode + past_key_values: Optional[torch.Tensor] max_seqlen: int - past_key_values: Optional[Union[torch.Tensor, List[torch.Tensor]]] # All tokens all_input_ids: List[List[int]] @@ -53,9 +59,6 @@ class FlashCausalLMBatch(Batch): next_token_choosers: List[NextTokenChooser] stopping_criterias: List[StoppingCriteria] - # Constant shared tensor, ref here just so that it's accessible in concatentate() - past_pad: Optional[torch.Tensor] - # Maximum number of tokens this batch will grow to max_tokens: int @@ -74,7 +77,6 @@ class FlashCausalLMBatch(Batch): tokenizer: PreTrainedTokenizerBase, device: torch.device, ) -> "FlashCausalLMBatch": - input_ids = [] position_ids = [] cu_seqlens = [0] max_seqlen = 0 @@ -83,7 +85,6 @@ class FlashCausalLMBatch(Batch): offsets = [] token_offsets = [] all_input_ids = [] - all_input_ids_tensor = [] requests_idx_mapping = {} next_token_choosers = [] @@ -109,15 +110,11 @@ class FlashCausalLMBatch(Batch): offsets.append(None) token_offsets.append(None) + all_input_ids.append(tokenized_input) - tokenized_input = torch.tensor(tokenized_input, device=device) - input_ids.append(tokenized_input) - # Position ids - position_ids.append( - torch.arange(0, input_length, dtype=torch.int32, device=device) - ) + position_ids.append(np.arange(0, input_length)) # Add cumulative lengths of all previous inputs cu_seqlens.append(cumulative_length + input_length) @@ -130,14 +127,19 @@ class FlashCausalLMBatch(Batch): max_new_tokens = stopping_criteria.max_new_tokens stopping_criterias.append(stopping_criteria) - all_input_ids_tensor.append( - 
F.pad(tokenized_input, (0, stopping_criteria.max_new_tokens)) - ) - # Update cumulative_length += input_length max_tokens += input_length + max_new_tokens + # Create tensors on device + input_ids = torch.tensor( + np.concatenate(all_input_ids), dtype=torch.int64, device=device + ) + position_ids = torch.tensor( + np.concatenate(position_ids), dtype=torch.int32, device=device + ) + cu_seqlens = torch.tensor(cu_seqlens, device=device, dtype=torch.int32) + return cls( batch_id=pb.id, requests=pb.requests, @@ -145,16 +147,16 @@ class FlashCausalLMBatch(Batch): input_ids=input_ids, position_ids=position_ids, cu_seqlens=cu_seqlens, + cu_seqlens_q=None, max_seqlen=max_seqlen, past_key_values=None, input_lengths=input_lengths, offsets=offsets, token_offsets=token_offsets, all_input_ids=all_input_ids, - all_input_ids_tensor=all_input_ids_tensor, + all_input_ids_tensor=[], next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, - past_pad=None, max_tokens=max_tokens, ) @@ -174,9 +176,13 @@ class FlashCausalLMBatch(Batch): # New values after filtering requests_idx_mapping = {} - input_ids = [] - position_ids = [] - cu_seqlens = [0] + input_ids = self.input_ids.new_empty(len(requests)) + position_ids = self.position_ids.new_empty(len(requests)) + # Create on CPU to only move to GPU once instead of at every copy + cu_seqlens = torch.zeros(len(requests) + 1, dtype=torch.int32) + cu_seqlens_q = torch.arange( + 0, len(requests) + 1, device=self.cu_seqlens_q.device, dtype=torch.int32 + ) max_seqlen = 0 past_key_values = [] @@ -199,16 +205,18 @@ class FlashCausalLMBatch(Batch): # Get length request_input_length = self.input_lengths[idx] - input_ids.append(self.input_ids[idx]) - position_ids.append(self.position_ids[idx]) - cu_seqlens.append(cumulative_length + request_input_length) - max_seqlen = max(max_seqlen, request_input_length) - # True index for past - past_key_values.append(self.past_key_values[2 * idx]) + # Copy tensors (GPU) + input_ids[i] = self.input_ids[idx] + position_ids[i] = self.position_ids[idx] - if not single_request: - # Add one padding - past_key_values.append(self.past_pad) + # Copy to tensor (CPU) + cu_seqlens[i + 1] = cumulative_length + request_input_length + max_seqlen = max(max_seqlen, request_input_length) + + # Slice from past + past_key_values.append( + self.past_key_values[:, self.cu_seqlens[idx] : self.cu_seqlens[idx + 1]] + ) all_input_ids.append(self.all_input_ids[idx]) all_input_ids_tensor.append(self.all_input_ids_tensor[idx]) @@ -229,7 +237,7 @@ class FlashCausalLMBatch(Batch): if single_request: # Preallocate tensor for bs = 1 case - past_key_values = torch.nn.functional.pad( + past_key_values = F.pad( past_key_values[0], ( 0, @@ -243,15 +251,21 @@ class FlashCausalLMBatch(Batch): - stopping_criterias[0].current_tokens, ), ) + else: + # Cat all past + past_key_values = torch.cat(past_key_values, dim=1) + + # Move to GPU now that we have the whole tensor + cu_seqlens = cu_seqlens.to(self.cu_seqlens.device) return FlashCausalLMBatch( batch_id=self.batch_id, - past_pad=self.past_pad, requests=requests, requests_idx_mapping=requests_idx_mapping, input_ids=input_ids, position_ids=position_ids, cu_seqlens=cu_seqlens, + cu_seqlens_q=cu_seqlens_q, max_seqlen=max_seqlen, past_key_values=past_key_values, input_lengths=input_lengths, @@ -271,9 +285,16 @@ class FlashCausalLMBatch(Batch): requests = [] requests_idx_mapping = {} - input_ids = [] - position_ids = [] + total_batch_size = sum([len(b) for b in batches]) + + device = batches[0].input_ids.device + 
+ input_ids = batches[0].input_ids.new_empty(total_batch_size) + position_ids = batches[0].position_ids.new_empty(total_batch_size) cu_seqlens = [0] + cu_seqlens_q = torch.arange( + 0, total_batch_size + 1, device=device, dtype=torch.int32 + ) max_seqlen = 0 past_key_values = [] @@ -302,22 +323,25 @@ class FlashCausalLMBatch(Batch): for k, v in batch.requests_idx_mapping.items(): requests_idx_mapping[k] = v + cumulative_batch_size - input_ids.extend(batch.input_ids) - position_ids.extend(batch.position_ids) + start_index = cumulative_batch_size + end_index = cumulative_batch_size + len(batch) + + # Copy tensors (GPU) + input_ids[start_index:end_index] = batch.input_ids + position_ids[start_index:end_index] = batch.position_ids + # Add cumulative lengths of all previous inputs cu_seqlens.extend([l + cumulative_length for l in batch.cu_seqlens[1:]]) max_seqlen = max(max_seqlen, batch.max_seqlen) if len(batch) != 1: - past_key_values.extend(batch.past_key_values) + past_key_values.append(batch.past_key_values) else: # past was pre-allocated for this batch # We need to slice to remove the padding past_key_values.append( batch.past_key_values[:, : batch.input_lengths[0]] ) - # Add one padding - past_key_values.append(batch.past_pad) all_input_ids.extend(batch.all_input_ids) all_input_ids_tensor.extend(batch.all_input_ids_tensor) @@ -334,14 +358,19 @@ class FlashCausalLMBatch(Batch): cumulative_batch_size += len(batch) max_tokens += batch.max_tokens + # Cat past + past_key_values = torch.cat(past_key_values, dim=1) + # Create final tensor on GPU + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device) + return FlashCausalLMBatch( batch_id=batches[0].batch_id, - past_pad=batches[0].past_pad, requests=requests, requests_idx_mapping=requests_idx_mapping, input_ids=input_ids, position_ids=position_ids, cu_seqlens=cu_seqlens, + cu_seqlens_q=cu_seqlens_q, max_seqlen=max_seqlen, past_key_values=past_key_values, input_lengths=input_lengths, @@ -367,10 +396,9 @@ class FlashCausalLM(Model): quantize: bool = False, decode_buffer: int = 3, ): - self.past_pad = None if torch.cuda.is_available(): device = torch.device("cuda") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + dtype = torch.float16 else: raise NotImplementedError("FlashCausalLM is only available on GPU") @@ -410,6 +438,7 @@ class FlashCausalLM(Model): input_ids: torch.Tensor, position_ids: torch.Tensor, cu_seqlens: torch.Tensor, + cu_seqlens_q: Optional[torch.Tensor], max_s: int, past_key_values: Optional = None, pre_allocate_past_size: Optional[int] = None, @@ -419,6 +448,7 @@ class FlashCausalLM(Model): input_ids=input_ids, position_ids=position_ids, cu_seqlens=cu_seqlens, + cu_seqlens_q=cu_seqlens_q, max_s=max_s, past_key_values=past_key_values, pre_allocate_past_size=pre_allocate_past_size, @@ -428,22 +458,9 @@ class FlashCausalLM(Model): def generate_token( self, batch: FlashCausalLMBatch ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch]]: - # Shortcut when batch_size == 1 - if len(batch) == 1: - input_ids = batch.input_ids[0].view(-1) - # No need to slice as flash attention will take care of it with cu_seqlens - past_key_values = batch.past_key_values - else: - # Concatenate tensors - input_ids = torch.cat(batch.input_ids).view(-1) - past_key_values = ( - torch.cat(batch.past_key_values, dim=1) - if batch.past_key_values is not None - else None - ) + prefill = batch.past_key_values is None - # if prefill and bs == 1 - if past_key_values is None and len(batch) == 1: + if prefill and 
len(batch) == 1: # Ask to pre-allocate kv to its max size # == number of tokens + max_new_tokens pre_allocate_past_size = ( @@ -452,42 +469,74 @@ class FlashCausalLM(Model): else: pre_allocate_past_size = None - # Concatenate when prefill, torch.tensor when decode - position_ids = ( - torch.tensor(batch.position_ids, device=self.device) - if batch.past_key_values is not None - else torch.cat(batch.position_ids) - ) - cu_seqlens = torch.tensor( - batch.cu_seqlens, device=self.device, dtype=torch.int32 - ) - out, present = self.forward( - input_ids, - position_ids, - cu_seqlens, + batch.input_ids, + batch.position_ids, + batch.cu_seqlens, + batch.cu_seqlens_q, batch.max_seqlen, - past_key_values, + batch.past_key_values, pre_allocate_past_size, ) - # Initialize past_key_values in prefill - if batch.past_key_values is None: - # Initialize past padding tensor - if self.past_pad is None: - self.past_pad = present.new_zeros( - present.shape[0], 1, *present.shape[2:] + if prefill: + if len(batch) > 1: + # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs + # When batch == 1, we will just use the batch.input_ids values directly + prefill_tokens_indices = batch.input_ids.new_zeros(len(batch.input_ids)) + + # Create batch.cu_seqlens_q for decode + batch.cu_seqlens_q = torch.arange( + 0, len(batch) + 1, device=self.device, dtype=torch.int32 + ) + next_input_ids = batch.input_ids.new_empty(len(batch)) + next_position_ids = batch.position_ids.new_empty(len(batch)) + else: + prefill_logprobs = None + next_input_ids = batch.input_ids + next_position_ids = batch.position_ids + + next_token_logprobs = out.new_empty(len(batch)) + + # Prepare past for next decode + if len(batch) > 1: + # Used to slice next batch past + past_indices = torch.empty( + present.shape[1], dtype=torch.int64, device=self.device + ) + batch.past_key_values = present.new_empty( + ( + present.shape[0], + present.shape[1] + len(batch.requests), + *present.shape[2:], ) - # Set in batch in case it needs to be used later in concatenate() - batch.past_pad = self.past_pad - if len(batch) == 1: - # present is already pre-padded - batch.past_key_values = present - else: - # Add padding after each sequence - # This will have the correct shape after the final past_key_values concatenation before the model - # forward - batch.past_key_values = [None, self.past_pad] * len(batch) + ) + + # It is actually faster to do a whole other for loop here as the copy from present to past is fairly slow + # and will run asynchronously while we do the next for loop + cumulative_length = 0 + for i, input_length in enumerate(batch.input_lengths): + # Indexing metadata + start_index = cumulative_length + end_index = cumulative_length + input_length + + # Indices to copy present at the correct place in past_key_values + torch.arange( + start_index + i, + end_index + i, + dtype=torch.int64, + device=self.device, + out=past_indices[start_index:end_index], + ) + cumulative_length += input_length + + # Copy from present to past_key_values + batch.past_key_values[:, past_indices] = present + + # Initialize past_key_values in prefill for len(batch) == 1 + elif prefill: + # present is already pre-padded + batch.past_key_values = present # Cumulative length cumulative_length = 0 @@ -496,6 +545,102 @@ class FlashCausalLM(Model): generations: List[Generation] = [] stopped = True + # Zipped iterator + iterator = zip( + batch.input_lengths, + batch.next_token_choosers, + batch.stopping_criterias, + batch.all_input_ids, + ) + + # We do two 
for loops as the first one can run completely asynchronously from the GPU while for the second + # one, we need to first do a GPU <-> CPU sync + # It is faster if we delay this sync for the maximum amount of time + + # For each member of the batch + for i, ( + input_length, + next_token_chooser, + stopping_criteria, + all_input_ids, + ) in enumerate(iterator): + # Indexing metadata + start_index = cumulative_length + end_index = cumulative_length + input_length + + if prefill: + # Prefill mode + # out is of shape [cumulative_sequence_lengths, vocab_size] + # only take last token logit + logits = out[end_index - 1 : end_index] + + # Create all_input_ids_tensor that will be used by token warpers (for example, RepetitionPenalty) + all_input_ids_tensor = batch.input_ids.new_empty( + input_length + stopping_criteria.max_new_tokens + ) + # Copy from batch.input_ids to all_input_ids_tensor + all_input_ids_tensor[:input_length] = batch.input_ids[ + start_index:end_index + ] + batch.all_input_ids_tensor.append(all_input_ids_tensor) + + # Initialize position_ids + # In decode, we do not need this as we can just increment position ids + next_position_ids[i] = batch.position_ids[end_index - 1] + + # Used to gather prefill logprobs + # Copy batch.input_ids to prefill_token_indices + if len(batch) > 1: + prefill_tokens_indices[ + start_index : end_index - 1 + ] = batch.input_ids[start_index + 1 : end_index] + else: + # Set prefill_tokens_indices to the correct slice + prefill_tokens_indices = batch.input_ids[ + start_index + 1 : end_index + ] + else: + # Decode mode + # out is of shape [batch_size, vocab_size] + logits = out[i].view(1, -1) + + all_input_ids_tensor = batch.all_input_ids_tensor[i] + + # Select next token + next_token_id, logprob = next_token_chooser( + all_input_ids_tensor[None, :input_length], logits + ) + + # Add to all_input_ids_tensor + next_token_id_squeezed = next_token_id.view(1) + all_input_ids_tensor[input_length] = next_token_id_squeezed + + # Set values + next_input_ids[i] = next_token_id_squeezed + next_token_logprobs[i] = logprob[-1, next_token_id].view(1) + + cumulative_length += input_length + + # Set values in batch + batch.input_ids = next_input_ids + batch.position_ids = next_position_ids + 1 + batch.cu_seqlens = batch.cu_seqlens + batch.cu_seqlens_q + + if prefill: + # Get prefill logprobs + prefill_logprobs_tensor = torch.log_softmax(out, -1) + prefill_logprobs = torch.gather( + prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) + ) + # GPU <-> CPU sync + prefill_logprobs = prefill_logprobs.view(-1).tolist() + + # GPU <-> CPU sync + next_token_logprobs = next_token_logprobs.tolist() + next_token_ids = batch.input_ids.tolist() + + cumulative_length = 0 + # Zipped iterator iterator = zip( batch.requests, @@ -506,6 +651,8 @@ class FlashCausalLM(Model): batch.stopping_criterias, batch.all_input_ids, batch.all_input_ids_tensor, + next_token_ids, + next_token_logprobs, ) # For each member of the batch @@ -518,34 +665,16 @@ class FlashCausalLM(Model): stopping_criteria, all_input_ids, all_input_ids_tensor, + next_token_id, + next_token_logprob, ) in enumerate(iterator): - # Indexing metadata start_index = cumulative_length end_index = cumulative_length + input_length - prefill = stopping_criteria.current_tokens == 0 - if prefill: - # Prefill mode - # out is of shape [cumulative_sequence_lengths, vocab_size] - logits = out[start_index:end_index] - else: - # Decode mode - # out is of shape [batch_size, vocab_size] - logits = out[i].unsqueeze(0) - - # Select next 
token - next_token_id, logprobs = next_token_chooser( - all_input_ids_tensor[None, :input_length], logits - ) - next_token_id_squeezed = next_token_id.squeeze() - next_token_id_item = next_token_id_squeezed.item() - # Append next token to all tokens - all_input_ids.append(next_token_id_item) - all_input_ids_tensor[input_length] = next_token_id_item + all_input_ids.append(next_token_id) # Generated token - next_token_logprob = logprobs[-1, next_token_id_item] next_token_text, offset, token_offset = self.decode_token( all_input_ids, offset, @@ -554,7 +683,7 @@ class FlashCausalLM(Model): # Evaluate stopping criteria stop, reason = stopping_criteria( - next_token_id_item, + next_token_id, next_token_text, ) @@ -579,9 +708,9 @@ class FlashCausalLM(Model): # Prefill if prefill: # Remove generated token to only have prefill and add nan for first prompt token - prefill_logprobs = [float("nan")] + logprobs.gather( - 1, all_input_ids_tensor[1:input_length].unsqueeze(1) - ).squeeze(1)[:-1].tolist() + request_prefill_logprobs = [float("nan")] + prefill_logprobs[ + start_index : end_index - 1 + ] prefill_token_ids = all_input_ids[:-1] prefill_texts = self.tokenizer.batch_decode( prefill_token_ids, @@ -589,7 +718,7 @@ class FlashCausalLM(Model): skip_special_tokens=False, ) prefill_tokens = PrefillTokens( - prefill_token_ids, prefill_logprobs, prefill_texts + prefill_token_ids, request_prefill_logprobs, prefill_texts ) else: prefill_tokens = None @@ -597,31 +726,23 @@ class FlashCausalLM(Model): generation = Generation( request.id, prefill_tokens, - next_token_id_item, + next_token_id, next_token_logprob, next_token_text, - next_token_id_item in self.all_special_ids, + next_token_id in self.all_special_ids, generated_text, ) generations.append(generation) - cumulative_length += input_length new_input_length = input_length + 1 # Update values - batch.input_ids[i] = next_token_id - batch.position_ids[i] = input_length batch.input_lengths[i] = new_input_length batch.offsets[i] = offset batch.token_offsets[i] = token_offset batch.all_input_ids[i] = all_input_ids - batch.all_input_ids_tensor[i] = all_input_ids_tensor - batch.max_seqlen = max(batch.max_seqlen, new_input_length) - if len(batch) != 1: - # Add each sequence before its padding - batch.past_key_values[i * 2] = present[:, start_index:end_index] - # Cumulative sum - batch.cu_seqlens[(i + 1)] = batch.cu_seqlens[i] + new_input_length + batch.max_seqlen = batch.max_seqlen + 1 + cumulative_length += input_length # No need to return a batch if we know that all requests stopped return generations, batch if not stopped else None diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 105ff519..e4426771 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -32,7 +32,7 @@ class FlashLlama(FlashCausalLM): self.past_pad = None if torch.cuda.is_available(): device = torch.device("cuda") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + dtype = torch.float16 else: raise NotImplementedError("FlashLlama is only available on GPU") @@ -161,7 +161,7 @@ class FlashLlamaSharded(FlashLlama): self.master = self.rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{self.rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + dtype = torch.float16 else: raise NotImplementedError("FlashLlama is only available on GPU") diff --git 
a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index fc769583..f439e812 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -38,7 +38,7 @@ class FlashNeoXSharded(FlashNeoX): self.master = self.rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{self.rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + dtype = torch.float16 else: raise NotImplementedError("FlashNeoX is only available on GPU") diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 333180e8..f0825ab9 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -31,7 +31,7 @@ class FlashSantacoder(FlashCausalLM): self.past_pad = None if torch.cuda.is_available(): device = torch.device("cuda") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + dtype = torch.float16 else: raise NotImplementedError("FlashSantacoder is only available on GPU") @@ -178,7 +178,7 @@ class FlashSantacoderSharded(FlashSantacoder): self.master = self.rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{self.rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + dtype = torch.float16 else: raise NotImplementedError("FlashSantacoderSharded is only available on GPU") From 49cffad1bc42e892f63be173384dce26e185e420 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Tue, 9 May 2023 19:02:52 +0200 Subject: [PATCH 21/39] fix(docker): fix nvidia env vars (#305) --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d68703cb..500f6580 100644 --- a/Dockerfile +++ b/Dockerfile @@ -123,7 +123,8 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ PORT=80 # NVIDIA env vars -ENV NVIDIA_REQUIRE_CUDA cuda>=11.8 +ENV CUDA_VERSION=11.8.0 +ENV NVIDIA_REQUIRE_CUDA=cuda>=11.8 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility # Required for nvidia-docker v1 From 158540446410cb231380f72ccc8449562939c8b3 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 10 May 2023 15:29:21 +0200 Subject: [PATCH 22/39] fix(docker): remove nvidia require cuda env (#310) --- 
Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 500f6580..c4494173 100644 --- a/Dockerfile +++ b/Dockerfile @@ -124,7 +124,6 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ # NVIDIA env vars ENV CUDA_VERSION=11.8.0 -ENV NVIDIA_REQUIRE_CUDA=cuda>=11.8 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility # Required for nvidia-docker v1 From 68e9d6ab333715008c542467c8d5202cf4692253 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 10 May 2023 15:48:21 +0200 Subject: [PATCH 23/39] feat(server): shard token decode (#303) --- router/client/src/lib.rs | 2 + router/client/src/sharded_client.rs | 24 ++++- server/text_generation_server/models/bloom.py | 12 ++- .../models/causal_lm.py | 96 ++++++++++--------- .../custom_modeling/flash_llama_modeling.py | 2 - .../custom_modeling/flash_neox_modeling.py | 2 - .../models/flash_causal_lm.py | 90 +++++++++-------- .../models/flash_llama.py | 12 ++- .../models/flash_neox.py | 12 ++- .../models/flash_santacoder.py | 12 ++- .../models/galactica.py | 12 ++- .../text_generation_server/models/gpt_neox.py | 12 ++- server/text_generation_server/models/model.py | 4 + server/text_generation_server/models/opt.py | 12 ++- .../models/seq2seq_lm.py | 80 +++++++++------- server/text_generation_server/models/t5.py | 12 ++- server/text_generation_server/utils/tokens.py | 6 +- 17 files changed, 224 insertions(+), 178 deletions(-) diff --git a/router/client/src/lib.rs b/router/client/src/lib.rs index 401082c5..66f2055a 100644 --- a/router/client/src/lib.rs +++ b/router/client/src/lib.rs @@ -23,6 +23,8 @@ pub enum ClientError { Connection(String), #[error("Server error: {0}")] Generation(String), + #[error("Sharded results are empty")] + EmptyResults, } impl From for ClientError { diff --git a/router/client/src/sharded_client.rs b/router/client/src/sharded_client.rs index 73524740..60b81fe6 100644 --- a/router/client/src/sharded_client.rs +++ b/router/client/src/sharded_client.rs @@ -1,6 +1,6 @@ /// Multi shard Client -use crate::Result; use crate::{Batch, Client, Generation, HealthResponse, Request, ShardInfo}; +use crate::{ClientError, Result}; use futures::future::join_all; use tonic::transport::Uri; use tracing::instrument; @@ -98,8 +98,9 @@ impl ShardedClient { .iter_mut() .map(|client| Box::pin(client.prefill(batch.clone()))) .collect(); - // all shards return the same message - 
join_all(futures).await.pop().unwrap() + let results: Result, Option)>> = + join_all(futures).await.into_iter().collect(); + merge_generations(results?) } /// Generate one token for each request in the given cached batches @@ -116,7 +117,20 @@ impl ShardedClient { .iter_mut() .map(|client| Box::pin(client.decode(batches.clone()))) .collect(); - // all shards return the same message - join_all(futures).await.pop().unwrap() + let results: Result, Option)>> = + join_all(futures).await.into_iter().collect(); + merge_generations(results?) } } + +/// Merge generations from the different model shards +fn merge_generations( + mut results: Vec<(Vec, Option)>, +) -> Result<(Vec, Option)> { + let (mut generations, next_batch) = results.pop().ok_or(ClientError::EmptyResults)?; + + for (mut shard_generations, _) in results.into_iter() { + generations.append(&mut shard_generations); + } + Ok((generations, next_batch)) +} diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index f528a430..25ec8cb8 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -63,10 +63,10 @@ class BLOOMSharded(BLOOM): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 else: device = torch.device("cpu") @@ -94,8 +94,8 @@ class BLOOMSharded(BLOOM): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = model.eval() torch.distributed.barrier(group=self.process_group) @@ -105,6 +105,8 @@ class BLOOMSharded(BLOOM): dtype=dtype, device=device, decode_buffer=1, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 26a9a661..610dc4e2 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -549,7 +549,7 @@ class CausalLM(Model): ) in enumerate(iterator): # Select next token next_token_id, logprobs = next_token_chooser( - all_input_ids.view(1, -1), logits + all_input_ids.view(1, -1), logits[-1:, :] ) # Append next token to all tokens @@ -569,54 +569,60 @@ class CausalLM(Model): next_token_text, ) - if stop: - # Decode generated tokens - output_text = self.decode( - all_input_ids[-stopping_criteria.current_tokens :, 0] - ) - # Get seed - if isinstance(next_token_chooser.choice, Sampling): - seed = next_token_chooser.choice.seed - else: - seed = None - - generated_text = GeneratedText( - output_text, stopping_criteria.current_tokens, reason, seed - ) - else: - # Keep request in the batch - generated_text = None + if not stop: stopped = False - # Prefill - if stopping_criteria.current_tokens == 1: - # Remove generated token to only have prefill and add nan for first prompt token - prefill_logprobs = [float("nan")] + logprobs.gather( - 1, all_input_ids[1:] - ).squeeze(1)[-new_input_length:-1].tolist() - prefill_token_ids = all_input_ids[-new_input_length:-1] - prefill_texts = self.tokenizer.batch_decode( - 
prefill_token_ids, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, - ) - prefill_tokens = PrefillTokens( - prefill_token_ids, prefill_logprobs, prefill_texts - ) - else: - prefill_tokens = None + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text = self.decode( + all_input_ids[-stopping_criteria.current_tokens :, 0] + ) + # Get seed + if isinstance(next_token_chooser.choice, Sampling): + seed = next_token_chooser.choice.seed + else: + seed = None - generation = Generation( - request.id, - prefill_tokens, - next_token_id_squeezed, - next_token_logprob, - next_token_text, - next_token_id_squeezed.item() in self.all_special_ids, - generated_text, - ) + generated_text = GeneratedText( + output_text, stopping_criteria.current_tokens, reason, seed + ) + else: + generated_text = None - generations.append(generation) + # Prefill + if stopping_criteria.current_tokens == 1: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + torch.log_softmax( + logits, -1 + ).gather(1, all_input_ids[1:]).squeeze(1)[ + -new_input_length:-1 + ].tolist() + prefill_token_ids = all_input_ids[-new_input_length:-1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = PrefillTokens( + prefill_token_ids, prefill_logprobs, prefill_texts + ) + else: + prefill_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + next_token_id_squeezed, + next_token_logprob, + next_token_text, + next_token_id_squeezed.item() in self.all_special_ids, + generated_text, + ) + + generations.append(generation) # Update values batch.input_ids[i, 0] = next_token_id diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 1293124a..6ae869db 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -622,10 +622,8 @@ class FlashLlamaForCausalLM(torch.nn.Module): self.process_group = process_group if self.process_group is not None: self.world_size = self.process_group.size() - self.rank = self.process_group.rank() else: self.world_size = 1 - self.rank = 0 self.model = FlashLlamaModel(config, process_group) diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index ae1465ab..d706df33 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -685,10 +685,8 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): self.process_group = process_group if self.process_group is not None: self.world_size = self.process_group.size() - self.rank = self.process_group.rank() else: self.world_size = 1 - self.rank = 0 self.gpt_neox = FlashGPTNeoXModel(config, process_group) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index b51a3dc6..e862cfeb 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -687,53 +687,59 @@ 
class FlashCausalLM(Model): next_token_text, ) - if stop: - # Decode generated tokens - output_text = self.decode( - all_input_ids[-stopping_criteria.current_tokens :] - ) - # Get seed - if isinstance(next_token_chooser.choice, Sampling): - seed = next_token_chooser.choice.seed - else: - seed = None - - generated_text = GeneratedText( - output_text, stopping_criteria.current_tokens, reason, seed - ) - else: + if not stop: stopped = False - generated_text = None - # Prefill - if prefill: - # Remove generated token to only have prefill and add nan for first prompt token - request_prefill_logprobs = [float("nan")] + prefill_logprobs[ - start_index : end_index - 1 - ] - prefill_token_ids = all_input_ids[:-1] - prefill_texts = self.tokenizer.batch_decode( - prefill_token_ids, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text = self.decode( + all_input_ids[-stopping_criteria.current_tokens :] + ) + # Get seed + if isinstance(next_token_chooser.choice, Sampling): + seed = next_token_chooser.choice.seed + else: + seed = None + + generated_text = GeneratedText( + output_text, stopping_criteria.current_tokens, reason, seed + ) + else: + generated_text = None + + # Prefill + if prefill: + # Remove generated token to only have prefill and add nan for first prompt token + request_prefill_logprobs = [float("nan")] + prefill_logprobs[ + start_index : end_index - 1 + ] + prefill_token_ids = all_input_ids[:-1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = PrefillTokens( + prefill_token_ids, request_prefill_logprobs, prefill_texts + ) + else: + prefill_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + next_token_id, + next_token_logprob, + next_token_text, + next_token_id in self.all_special_ids, + generated_text, ) - prefill_tokens = PrefillTokens( - prefill_token_ids, request_prefill_logprobs, prefill_texts - ) - else: - prefill_tokens = None - generation = Generation( - request.id, - prefill_tokens, - next_token_id, - next_token_logprob, - next_token_text, - next_token_id in self.all_special_ids, - generated_text, - ) + generations.append(generation) - generations.append(generation) new_input_length = input_length + 1 # Update values diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index e4426771..a3ba2084 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -157,10 +157,10 @@ class FlashLlamaSharded(FlashLlama): self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): self.past_pad = None - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.float16 else: raise NotImplementedError("FlashLlama is only available on GPU") @@ -190,8 +190,8 @@ class FlashLlamaSharded(FlashLlama): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = 
model.eval().to(device) torch.distributed.barrier(group=self.process_group) @@ -200,6 +200,8 @@ class FlashLlamaSharded(FlashLlama): requires_padding=False, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index f439e812..b3e1876f 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -34,10 +34,10 @@ class FlashNeoXSharded(FlashNeoX): self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): self.past_pad = None - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.float16 else: raise NotImplementedError("FlashNeoX is only available on GPU") @@ -64,8 +64,8 @@ class FlashNeoXSharded(FlashNeoX): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = model.eval().to(device) torch.distributed.barrier(group=self.process_group) @@ -74,6 +74,8 @@ class FlashNeoXSharded(FlashNeoX): requires_padding=False, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index f0825ab9..6e5c231e 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -174,10 +174,10 @@ class FlashSantacoderSharded(FlashSantacoder): self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): self.past_pad = None - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.float16 else: raise NotImplementedError("FlashSantacoderSharded is only available on GPU") @@ -204,8 +204,8 @@ class FlashSantacoderSharded(FlashSantacoder): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, transpose=config.architectures[0].startswith("GPT2"), ) self.model = model.eval().to(device) @@ -215,6 +215,8 @@ class FlashSantacoderSharded(FlashSantacoder): requires_padding=False, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index 2577f1b1..57df0bab 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -195,10 +195,10 @@ class GalacticaSharded(Galactica): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = 
torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 else: device = torch.device("cpu") @@ -226,8 +226,8 @@ class GalacticaSharded(Galactica): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = model.eval() torch.distributed.barrier(group=self.process_group) @@ -236,6 +236,8 @@ class GalacticaSharded(Galactica): requires_padding=True, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index e73a3c82..c71bf366 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -34,10 +34,10 @@ class GPTNeoxSharded(CausalLM): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 else: device = torch.device("cpu") @@ -65,8 +65,8 @@ class GPTNeoxSharded(CausalLM): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = model.eval() torch.distributed.barrier(group=self.process_group) @@ -75,6 +75,8 @@ class GPTNeoxSharded(CausalLM): requires_padding=True, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index bfae829d..4c85c952 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -18,6 +18,8 @@ class Model(ABC): dtype: torch.dtype, device: torch.device, decode_buffer: int = 3, + rank: int = 0, + world_size: int = 1, ): if decode_buffer < 1: raise ValueError("decode_buffer must be >= 1") @@ -28,6 +30,8 @@ class Model(ABC): self.dtype = dtype self.device = device self.decode_buffer = decode_buffer + self.rank = rank + self.world_size = world_size @property def info(self) -> InfoResponse: diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 50e5271e..cdc32c56 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -50,10 +50,10 @@ class OPTSharded(OPT): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = torch.device(f"cuda:{rank}") dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 else: device = torch.device("cpu") @@ -81,8 +81,8 @@ class OPTSharded(OPT): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = model.eval() 
torch.distributed.barrier(group=self.process_group) @@ -91,6 +91,8 @@ class OPTSharded(OPT): requires_padding=True, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index 4ac5ed3c..d4a0ddcc 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -631,7 +631,7 @@ class Seq2SeqLM(Model): ) in enumerate(iterator): # Select next token next_token_id, logprobs = next_token_chooser( - all_decoder_input_ids.view(1, -1), logits + all_decoder_input_ids.view(1, -1), logits[-1:, :] ) # Append next token to decoder tokens @@ -650,46 +650,52 @@ class Seq2SeqLM(Model): # Evaluate stopping criteria stop, reason = stopping_criteria(next_token_id, next_token_text) - if stop: - # Slice with decoder_input_length to remove padding - # Decode all tokens - output_text = self.decode(all_decoder_input_ids[-decoder_input_length:]) - - # Get seed - if isinstance(next_token_chooser.choice, Sampling): - seed = next_token_chooser.choice.seed - else: - seed = None - - generated_text = GeneratedText( - output_text, stopping_criteria.current_tokens, reason, seed - ) - else: - # Keep request in the batch - generated_text = None + if not stop: stopped = False - # Prefill - if stopping_criteria.current_tokens == 1: - prefill_tokens = PrefillTokens( - [self.tokenizer.bos_token_id], - [float("nan")], - [self.tokenizer.bos_token], + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Slice with decoder_input_length to remove padding + # Decode all tokens + output_text = self.decode( + all_decoder_input_ids[-decoder_input_length:] + ) + + # Get seed + if isinstance(next_token_chooser.choice, Sampling): + seed = next_token_chooser.choice.seed + else: + seed = None + + generated_text = GeneratedText( + output_text, stopping_criteria.current_tokens, reason, seed + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1: + prefill_tokens = PrefillTokens( + [self.tokenizer.bos_token_id], + [float("nan")], + [self.tokenizer.bos_token], + ) + else: + prefill_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + next_token_id_squeezed, + next_token_logprob, + next_token_text, + next_token_id_squeezed.item() in self.all_special_ids, + generated_text, ) - else: - prefill_tokens = None - generation = Generation( - request.id, - prefill_tokens, - next_token_id_squeezed, - next_token_logprob, - next_token_text, - next_token_id_squeezed.item() in self.all_special_ids, - generated_text, - ) - - generations.append(generation) + generations.append(generation) # Update values batch.decoder_input_ids[i] = next_token_id diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 9e8c3c4c..5691c005 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -34,10 +34,10 @@ class T5Sharded(Seq2SeqLM): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.process_group, self.rank, self.world_size = initialize_torch_distributed() - self.master = self.rank == 0 + self.process_group, rank, world_size = initialize_torch_distributed() + self.master = rank == 0 if torch.cuda.is_available(): - device = torch.device(f"cuda:{self.rank}") + device = 
torch.device(f"cuda:{rank}") dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 else: device = torch.device("cpu") @@ -65,8 +65,8 @@ class T5Sharded(Seq2SeqLM): quantize=quantize, device=device, dtype=dtype, - rank=self.rank, - world_size=self.world_size, + rank=rank, + world_size=world_size, ) self.model = model.eval() torch.distributed.barrier(group=self.process_group) @@ -75,6 +75,8 @@ class T5Sharded(Seq2SeqLM): requires_padding=True, dtype=dtype, device=device, + rank=rank, + world_size=world_size, ) @staticmethod diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py index 23f504c6..d5a77170 100644 --- a/server/text_generation_server/utils/tokens.py +++ b/server/text_generation_server/utils/tokens.py @@ -75,11 +75,7 @@ class NextTokenChooser: def __call__(self, input_ids, scores): # Warp logits - if scores.shape[0] > 1: - # only warp the last token logits - scores[-1:, :] = self.warpers(input_ids, scores[-1:, :]) - else: - scores = self.warpers(input_ids, scores) + scores = self.warpers(input_ids, scores) # Compute logprobs logprobs = torch.log_softmax(scores, -1) From 745f596c880fdeb4766ca36e29bbf8a7e562e661 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 10 May 2023 15:51:10 +0200 Subject: [PATCH 24/39] feat(server): use float16 (#304) --- server/text_generation_server/models/bloom.py | 2 +- server/text_generation_server/models/causal_lm.py | 2 +- server/text_generation_server/models/galactica.py | 2 +- server/text_generation_server/models/gpt_neox.py | 2 +- server/text_generation_server/models/opt.py | 2 +- server/text_generation_server/models/santacoder.py | 2 +- server/text_generation_server/models/seq2seq_lm.py | 2 +- server/text_generation_server/models/t5.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 25ec8cb8..7c50644a 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -67,7 +67,7 @@ class BLOOMSharded(BLOOM): self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: device = torch.device("cpu") dtype = torch.float32 diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 610dc4e2..f838fc5c 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -452,7 +452,7 @@ class CausalLM(Model): ): if torch.cuda.is_available(): device = torch.device("cuda") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: if quantize: raise ValueError("quantization is not available on CPU") diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index 57df0bab..f6098e6c 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -199,7 +199,7 @@ class GalacticaSharded(Galactica): self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: device = torch.device("cpu") dtype = torch.float32 diff --git a/server/text_generation_server/models/gpt_neox.py 
b/server/text_generation_server/models/gpt_neox.py index c71bf366..cd36dba0 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -38,7 +38,7 @@ class GPTNeoxSharded(CausalLM): self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: device = torch.device("cpu") dtype = torch.float32 diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index cdc32c56..44f15df3 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -54,7 +54,7 @@ class OPTSharded(OPT): self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: device = torch.device("cpu") dtype = torch.float32 diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index a7b09a82..5a142676 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -17,7 +17,7 @@ class SantaCoder(CausalLM): def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): if torch.cuda.is_available(): device = torch.device("cuda") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: if quantize: raise ValueError("quantization is not available on CPU") diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index d4a0ddcc..77912cff 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -506,7 +506,7 @@ class Seq2SeqLM(Model): ): if torch.cuda.is_available(): device = torch.device("cuda") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: if quantize: raise ValueError("quantization is not available on CPU") diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 5691c005..4cfdea9e 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -38,7 +38,7 @@ class T5Sharded(Seq2SeqLM): self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") - dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32 + dtype = torch.float16 else: device = torch.device("cpu") dtype = torch.float32 From 35ab6cfcf138c8d47e825f801a010b648c7502da Mon Sep 17 00:00:00 2001 From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Date: Wed, 10 May 2023 16:16:06 +0200 Subject: [PATCH 25/39] fix(docker): remove CUDA_VERSION --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c4494173..50145395 100644 --- a/Dockerfile +++ b/Dockerfile @@ -123,7 +123,6 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ PORT=80 # NVIDIA env vars -ENV CUDA_VERSION=11.8.0 ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility # Required for nvidia-docker v1 From a6c18c39bb9431e6d3aba407f1d30eb43d82c04d Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Wed, 10 May 2023 19:08:54 +0200 Subject: [PATCH 26/39] feat(server): use cuda graph in logits warping (#302) --- 
server/text_generation_server/utils/tokens.py | 125 +++++++++++++----- 1 file changed, 93 insertions(+), 32 deletions(-) diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py index d5a77170..c3477eef 100644 --- a/server/text_generation_server/utils/tokens.py +++ b/server/text_generation_server/utils/tokens.py @@ -1,8 +1,8 @@ import re import torch +from functools import lru_cache from transformers import ( - LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, @@ -25,8 +25,10 @@ class Sampling: def __call__(self, logits): probs = torch.nn.functional.softmax(logits, -1) - next_tokens = torch.multinomial(probs, num_samples=1, generator=self.generator) - return next_tokens + # See: https://github.com/pytorch/pytorch/blob/925a3788ec5c06db62ca732a0e9425a26a00916f/aten/src/ATen/native/Distributions.cpp#L631-L637 + q = torch.empty_like(probs).exponential_(1, generator=self.generator).div_(probs) + + return q.argmax() class Greedy: @@ -34,6 +36,63 @@ class Greedy: return logits.argmax() +class StaticWarper: + def __init__( + self, + temperature=1.0, + top_k=None, + top_p=None, + typical_p=None, + ): + self.warpers = [] + + if temperature is not None and temperature != 1.0: + temperature = float(temperature) + self.warpers.append(TemperatureLogitsWarper(temperature)) + if top_k is not None and top_k != 0: + self.warpers.append(TopKLogitsWarper(top_k=top_k)) + if top_p is not None and top_p < 1.0: + self.warpers.append(TopPLogitsWarper(top_p=top_p)) + if typical_p is not None and typical_p < 1.0: + self.warpers.append(TypicalLogitsWarper(mass=typical_p)) + + self.cuda_graph = None + self.static_scores = None + self.static_warped_scores = None + self.static_next_logprob = None + + def __call__(self, scores): + if self.cuda_graph is None: + self.static_scores = scores + self.cuda_graph = torch.cuda.CUDAGraph() + + with torch.cuda.graph(self.cuda_graph): + for warper in self.warpers: + self.static_warped_scores = warper(None, self.static_scores) + + # Compute logprobs + self.static_next_logprob = torch.log_softmax( + self.static_warped_scores, -1 + ) + + self.static_scores.copy_(scores) + self.cuda_graph.replay() + + return self.static_warped_scores, self.static_next_logprob + + +@lru_cache(10) +def static_warper( + temperature: Optional[float], + top_k: Optional[int], + top_p: Optional[float], + typical_p: Optional[float], +) -> StaticWarper: + return StaticWarper( + temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p + ) + + class NextTokenChooser: def __init__( self, @@ -47,43 +106,45 @@ class NextTokenChooser: seed=0, device="cpu", ): - warpers = LogitsProcessorList() - # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files - # all samplers can be found in `generation_utils_samplers.py` - sampling = do_sample + self.watermark_processor = ( + WatermarkLogitsProcessor(device=device) if watermark else None + ) + self.repetition_processor = ( + RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty) + if repetition_penalty + else None + ) - if watermark: - warpers.append(WatermarkLogitsProcessor(device=device)) - if repetition_penalty is not None and repetition_penalty != 1.0: - warpers.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) - if temperature is not None and temperature != 1.0: - temperature = float(temperature) - warpers.append(TemperatureLogitsWarper(temperature)) - sampling = True - if top_k is not None and 
top_k != 0: - warpers.append(TopKLogitsWarper(top_k=top_k)) - sampling = True - if top_p is not None and top_p < 1.0: - warpers.append(TopPLogitsWarper(top_p=top_p)) - sampling = True - if typical_p is not None and typical_p < 1.0: - warpers.append(TypicalLogitsWarper(mass=typical_p)) - sampling = True + has_warpers = ( + (temperature is not None and temperature != 1.0) + or (top_k is not None and top_k != 0) + or (top_p is not None and top_p < 1.0) + or (typical_p is not None and typical_p < 1.0) + ) + if has_warpers: + self.static_warper = static_warper( + temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p + ) + else: + self.static_warper = None - self.warpers = warpers + sampling = do_sample or has_warpers self.choice = Sampling(seed, device) if sampling else Greedy() def __call__(self, input_ids, scores): - # Warp logits - scores = self.warpers(input_ids, scores) + if self.watermark_processor: + scores = self.watermark_processor(input_ids, scores) + if self.repetition_processor: + scores = self.repetition_processor(input_ids, scores) - # Compute logprobs - logprobs = torch.log_softmax(scores, -1) + if self.static_warper is None: + next_logprob = torch.log_softmax(scores, -1) + else: + scores, next_logprob = self.static_warper(scores) - # Choose tokens - next_id = self.choice(scores[-1]) + next_id = self.choice(scores[-1]).view(1, 1) - return next_id.view(1, 1), logprobs + return next_id, next_logprob @classmethod def from_pb( From 4f6d038c0b92ef84e3b02940a0277d1b6ab47c10 Mon Sep 17 00:00:00 2001 From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Date: Thu, 11 May 2023 13:30:38 +0200 Subject: [PATCH 27/39] fix(server): fix multinomial implem in Sampling --- server/text_generation_server/utils/tokens.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py index c3477eef..e9fb96b0 100644 --- a/server/text_generation_server/utils/tokens.py +++ b/server/text_generation_server/utils/tokens.py @@ -25,10 +25,10 @@ class Sampling: def __call__(self, logits): probs = torch.nn.functional.softmax(logits, -1) + # Avoid GPU<->CPU sync done by torch multinomial # See: https://github.com/pytorch/pytorch/blob/925a3788ec5c06db62ca732a0e9425a26a00916f/aten/src/ATen/native/Distributions.cpp#L631-L637 - q = torch.empty_like(probs).exponential_(1, generator=self.generator).div_(probs) - - return q.argmax() + q = torch.empty_like(probs).exponential_(1, generator=self.generator) + return probs.div_(q).argmax() class Greedy: From 76a48cd365e44a467ff3f5a5ee6db1a48a60e267 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 12 May 2023 14:46:41 +0200 Subject: [PATCH 28/39] feat(server): GPTQ quantization (step1) (#277) Changes only the type from `bool` to `Option` pretty much everywhere. - Use `Optional[str]` in Python (easier to manage than importing type everywhere). Except for the cli to get proper validation - Updated all models to handle gracefully new values. (Error out if unknown value, or gptq since not implemented). Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? 
Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- launcher/src/main.rs | 37 +++++++++++++++---- server/text_generation_server/cli.py | 10 ++++- .../text_generation_server/models/__init__.py | 2 +- server/text_generation_server/models/bloom.py | 25 ++++++++++--- .../models/causal_lm.py | 4 +- .../custom_modeling/flash_llama_modeling.py | 8 +++- .../custom_modeling/flash_neox_modeling.py | 10 +++-- .../flash_santacoder_modeling.py | 10 +++-- .../models/flash_causal_lm.py | 4 +- .../models/flash_llama.py | 5 ++- .../models/galactica.py | 18 ++++++--- .../text_generation_server/models/gpt_neox.py | 18 ++++++--- .../models/santacoder.py | 9 ++++- .../models/seq2seq_lm.py | 4 +- server/text_generation_server/models/t5.py | 17 +++++++-- server/text_generation_server/server.py | 4 +- 16 files changed, 136 insertions(+), 49 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 867bfd3d..4a6cd621 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1,4 +1,4 @@ -use clap::Parser; +use clap::{Parser, ValueEnum}; use serde::Deserialize; use std::env; use std::ffi::OsString; @@ -16,6 +16,26 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection}; mod env_runtime; +#[derive(Clone, Copy, Debug, ValueEnum)] +enum Quantization { + Bitsandbytes, + Gptq, +} + +impl std::fmt::Display for Quantization { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // To keep in track with `server`. + match self { + Quantization::Bitsandbytes => { + write!(f, "bitsandbytes") + } + Quantization::Gptq => { + write!(f, "gptq") + } + } + } +} + /// App Configuration #[derive(Parser, Debug)] #[clap(author, version, about, long_about = None)] @@ -46,10 +66,10 @@ struct Args { #[clap(long, env)] num_shard: Option, - /// Wether you want the model to be quantized or not. This will use bitsandbytes for - /// quantization on the fly. - #[clap(long, env)] - quantize: bool, + /// Wether you want the model to be quantized or not. This will use `bitsandbytes` for + /// quantization on the fly, or `gptq`. + #[clap(long, env, value_enum)] + quantize: Option, /// The maximum amount of concurrent requests for this particular deployment. 
/// Having a low limit will refuse clients requests instead of having them @@ -218,7 +238,7 @@ enum ShardStatus { fn shard_manager( model_id: String, revision: Option, - quantize: bool, + quantize: Option, uds_path: String, rank: usize, world_size: usize, @@ -257,8 +277,9 @@ fn shard_manager( shard_argv.push("--sharded".to_string()); } - if quantize { - shard_argv.push("--quantize".to_string()) + if let Some(quantize) = quantize { + shard_argv.push("--quantize".to_string()); + shard_argv.push(quantize.to_string()) } // Model optional revision diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 92482a94..35cad20f 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -5,17 +5,23 @@ import typer from pathlib import Path from loguru import logger from typing import Optional +from enum import Enum app = typer.Typer() +class Quantization(str, Enum): + bitsandbytes = "bitsandbytes" + gptq = "gptq" + + @app.command() def serve( model_id: str, revision: Optional[str] = None, sharded: bool = False, - quantize: bool = False, + quantize: Optional[Quantization] = None, uds_path: Path = "/tmp/text-generation-server", logger_level: str = "INFO", json_output: bool = False, @@ -55,6 +61,8 @@ def serve( if otlp_endpoint is not None: setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint) + # Downgrade enum into str for easier management later on + quantize = None if quantize is None else quantize.value server.serve(model_id, revision, sharded, quantize, uds_path) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 221c9139..e02be3de 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -91,7 +91,7 @@ torch.set_grad_enabled(False) def get_model( - model_id: str, revision: Optional[str], sharded: bool, quantize: bool + model_id: str, revision: Optional[str], sharded: bool, quantize: Optional[str] ) -> Model: if "facebook/galactica" in model_id: if sharded: diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 7c50644a..877acb00 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -49,7 +49,12 @@ class BloomCausalLMBatch(CausalLMBatch): class BLOOM(CausalLM): - def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + ): super(BLOOM, self).__init__( model_id=model_id, revision=revision, quantize=quantize, decode_buffer=1 ) @@ -61,7 +66,10 @@ class BLOOM(CausalLM): class BLOOMSharded(BLOOM): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() self.master = rank == 0 @@ -113,7 +121,7 @@ class BLOOMSharded(BLOOM): def load_weights( model, filenames: List[str], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, rank: int, @@ -167,7 +175,7 @@ class BLOOMSharded(BLOOM): tensor = tensor.contiguous().to(dtype) - if quantize: + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -217,9 
+225,14 @@ class BLOOMSharded(BLOOM): return linear module.linear = replace_linear(state) - - else: + elif quantize == "gptq": + raise NotImplementedError( + "`gptq` is not implemented for now" + ) + elif quantize is None: tensor = tensor.to(device) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") module._parameters[param_name] = tensor if name == "word_embeddings.weight": diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index f838fc5c..0d521ac4 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -447,7 +447,7 @@ class CausalLM(Model): self, model_id: str, revision: Optional[str] = None, - quantize: bool = False, + quantize: Optional[str] = None, decode_buffer: int = 3, ): if torch.cuda.is_available(): @@ -468,7 +468,7 @@ class CausalLM(Model): revision=revision, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None, - load_in_8bit=quantize, + load_in_8bit=quantize == "bitsandbytes", ).eval() tokenizer.pad_token_id = ( self.model.config.pad_token_id diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 6ae869db..5b32b22e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -105,7 +105,7 @@ class FastLinear(nn.Linear): self.bnb_linear = None def prepare_weights(self, quantize: bool = False): - if quantize: + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -129,8 +129,12 @@ class FastLinear(nn.Linear): # Delete reference to data self.weight = None self.bias = None - else: + elif quantize == "gptq": + raise NotImplementedError("`gptq` is not implemented for now") + elif quantize is None: self.weight = nn.Parameter(self.weight.T) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") def forward(self, input: torch.Tensor) -> torch.Tensor: if self.quantized: diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index d706df33..e7b878c0 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -92,8 +92,8 @@ class FastLinear(nn.Linear): self.quantized = False self.bnb_linear = None - def prepare_weights(self, quantize: bool = False): - if quantize: + def prepare_weights(self, quantize: Optional[str] = None): + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -117,8 +117,12 @@ class FastLinear(nn.Linear): # Delete reference to data self.weight = None self.bias = None - else: + elif quantize == "gptq": + raise NotImplementedError("`gptq` is not implemented for now") + elif quantize is None: self.weight = nn.Parameter(self.weight.T) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") def forward(self, input: torch.Tensor) -> torch.Tensor: if self.quantized: diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py 
b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 20ad8385..309ec19f 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -67,8 +67,8 @@ class FastLinear(nn.Linear): self.quantized = False self.bnb_linear = None - def prepare_weights(self, quantize: bool = False): - if quantize: + def prepare_weights(self, quantize: Optional[str] = None): + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -92,8 +92,12 @@ class FastLinear(nn.Linear): # Delete reference to data self.weight = None self.bias = None - else: + elif quantize == "gptq": + raise NotImplementedError("`gptq` is not implemented for now") + elif quantize is None: self.weight = nn.Parameter(self.weight.T) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") def forward(self, input: torch.Tensor) -> torch.Tensor: if self.quantized: diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index e862cfeb..0a9fccca 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -393,7 +393,7 @@ class FlashCausalLM(Model): model_cls: Type[PreTrainedModel], model_id: str, revision: Optional[str] = None, - quantize: bool = False, + quantize: Optional[str] = None, decode_buffer: int = 3, ): if torch.cuda.is_available(): @@ -410,7 +410,7 @@ class FlashCausalLM(Model): model_id, revision=revision, torch_dtype=dtype, - load_in_8bit=quantize, + load_in_8bit=quantize == "bitsandbytes", ) .eval() .to(device) diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index a3ba2084..5f47cf66 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -154,7 +154,10 @@ class FlashLlama(FlashCausalLM): class FlashLlamaSharded(FlashLlama): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.past_pad = None self.process_group, rank, world_size = initialize_torch_distributed() diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index f6098e6c..4f94b348 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -193,7 +193,10 @@ class Galactica(OPT): class GalacticaSharded(Galactica): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() self.master = rank == 0 @@ -244,7 +247,7 @@ class GalacticaSharded(Galactica): def load_weights( model, filenames: List[str], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, rank: int, @@ -299,7 +302,7 @@ class GalacticaSharded(Galactica): tensor = tensor.contiguous().to(dtype) - if quantize: + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -349,9 +352,14 
@@ class GalacticaSharded(Galactica): return linear module.linear = replace_linear(state) - - else: + elif quantize == "gptq": + raise NotImplementedError( + "`gptq` is not implemented for now" + ) + elif quantize is None: tensor = tensor.to(device) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") module._parameters[param_name] = tensor if name == "model.decoder.embed_tokens.weight": diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index cd36dba0..2d42e0b0 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -32,7 +32,10 @@ except Exception as e: class GPTNeoxSharded(CausalLM): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() self.master = rank == 0 @@ -83,7 +86,7 @@ class GPTNeoxSharded(CausalLM): def load_weights( model, filenames: List[str], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, rank: int, @@ -148,7 +151,7 @@ class GPTNeoxSharded(CausalLM): tensor = tensor.contiguous().to(dtype) - if quantize: + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -198,9 +201,14 @@ class GPTNeoxSharded(CausalLM): return linear module.linear = replace_linear(state) - - else: + elif quantize == "gptq": + raise NotImplementedError( + "`gptq` is not implemented for now" + ) + elif quantize is None: tensor = tensor.to(device) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") if current_parameter_tensor is not None: module._parameters[param_name] = tensor diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index 5a142676..4bd56de1 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -14,7 +14,12 @@ EOD = "<|endoftext|>" class SantaCoder(CausalLM): - def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + ): if torch.cuda.is_available(): device = torch.device("cuda") dtype = torch.float16 @@ -46,7 +51,7 @@ class SantaCoder(CausalLM): model_id, revision=revision, torch_dtype=dtype, - load_in_8bit=quantize, + load_in_8bit=quantize == "bitsandbytes", trust_remote_code=True, # required ) .to(device) diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index 77912cff..84854f5d 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -501,7 +501,7 @@ class Seq2SeqLM(Model): self, model_id: str, revision: Optional[str] = None, - quantize: bool = False, + quantize: Optional[str] = None, decode_buffer: int = 3, ): if torch.cuda.is_available(): @@ -519,7 +519,7 @@ class Seq2SeqLM(Model): revision=revision, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None, - load_in_8bit=quantize, + load_in_8bit=quantize == "bitsandbytes", ).eval() tokenizer = AutoTokenizer.from_pretrained( model_id, revision=revision, padding_side="left", truncation_side="left" diff --git 
a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 4cfdea9e..381617b7 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -32,7 +32,10 @@ except Exception as e: class T5Sharded(Seq2SeqLM): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() self.master = rank == 0 @@ -83,7 +86,7 @@ class T5Sharded(Seq2SeqLM): def load_weights( model, filenames: List[str], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, rank: int, @@ -154,7 +157,7 @@ class T5Sharded(Seq2SeqLM): tensor = tensor.contiguous().to(dtype) - if quantize: + if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( "bitsandbytes is not available on your machine either because it is not installed " @@ -205,8 +208,14 @@ class T5Sharded(Seq2SeqLM): module.linear = replace_linear(state) - else: + elif quantize == "gptq": + raise NotImplementedError( + "`gptq` is not implemented for now" + ) + elif quantize is None: tensor = tensor.to(device) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") if current_parameter_tensor is not None: module._parameters[param_name] = tensor diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 70f08ed7..d715207b 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -100,14 +100,14 @@ def serve( model_id: str, revision: Optional[str], sharded: bool, - quantize: bool, + quantize: Optional[str], uds_path: Path, ): async def serve_inner( model_id: str, revision: Optional[str], sharded: bool = False, - quantize: bool = False, + quantize: Optional[str] = None, ): unix_socket_template = "unix://{}-{}" if sharded: From 8a8f43410d0b37c496bda429741a0651d3515765 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Fri, 12 May 2023 17:32:40 +0200 Subject: [PATCH 29/39] chore(docker): use nvidia base image (#318) --- Dockerfile | 13 +--------- .../flash_santacoder_modeling.py | 26 ++++++++++++++----- .../models/flash_santacoder.py | 1 + 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 50145395..930d7ad7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -108,7 +108,7 @@ COPY server/Makefile-transformers Makefile RUN BUILD_EXTENSIONS="True" make build-transformers # Text Generation Inference base image -FROM debian:bullseye-slim as base +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as base # Conda env ENV PATH=/opt/conda/bin:$PATH \ @@ -122,17 +122,6 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ NUM_SHARD=1 \ PORT=80 -# NVIDIA env vars -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -# Required for nvidia-docker v1 -RUN /bin/bash -c echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH - -LABEL com.nvidia.volumes.needed="nvidia_driver" - WORKDIR /usr/src RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py 
b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 309ec19f..23c3ea28 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -585,13 +585,25 @@ class FlashSantacoderForCausalLM(nn.Module): if self.transformer.tp_embeddings: # Logits are sharded, so we need to gather them - world_logits = [ - torch.empty_like(logits) for _ in range(self.transformer.tp_world_size) - ] - torch.distributed.all_gather( - world_logits, logits, group=self.transformer.process_group - ) - world_logits = torch.cat(world_logits, dim=1) + if logits.shape[0] == 1: + # Fast path when batch size is 1 + world_logits = logits.new_empty( + (logits.shape[1] * self.transformer.tp_world_size) + ) + torch.distributed.all_gather_into_tensor( + world_logits, logits.view(-1), group=self.transformer.process_group + ) + world_logits = world_logits.view(1, -1) + else: + # We cannot use all_gather_into_tensor as it only support concatenating on the first dim + world_logits = [ + torch.empty_like(logits) + for _ in range(self.transformer.tp_world_size) + ] + torch.distributed.all_gather( + world_logits, logits, group=self.transformer.process_group + ) + world_logits = torch.cat(world_logits, dim=1) return world_logits, present diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 6e5c231e..c463ee98 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -217,6 +217,7 @@ class FlashSantacoderSharded(FlashSantacoder): device=device, rank=rank, world_size=world_size, + decode_buffer=1, ) @staticmethod From 119f7e0687df3c3b6160ad9e7801cfb6151d890d Mon Sep 17 00:00:00 2001 From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Date: Fri, 12 May 2023 17:56:32 +0200 Subject: [PATCH 30/39] fix(docker): remove quantize default --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 930d7ad7..3d44d133 100644 --- a/Dockerfile +++ b/Dockerfile @@ -118,7 +118,6 @@ ENV PATH=/opt/conda/bin:$PATH \ ENV HUGGINGFACE_HUB_CACHE=/data \ HF_HUB_ENABLE_HF_TRANSFER=1 \ MODEL_ID=bigscience/bloom-560m \ - QUANTIZE=false \ NUM_SHARD=1 \ PORT=80 From 22c4fd07abe9d499cd8eda807f389084773124bd Mon Sep 17 00:00:00 2001 From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Date: Fri, 12 May 2023 18:38:59 +0200 Subject: [PATCH 31/39] fix(docker): use ubuntu20.04 --- Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3d44d133..483270a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -108,7 +108,7 @@ COPY server/Makefile-transformers Makefile RUN BUILD_EXTENSIONS="True" make build-transformers # Text Generation Inference base image -FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as base +FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base # Conda env ENV PATH=/opt/conda/bin:$PATH \ @@ -117,8 +117,6 @@ ENV PATH=/opt/conda/bin:$PATH \ # Text Generation Inference base env ENV HUGGINGFACE_HUB_CACHE=/data \ HF_HUB_ENABLE_HF_TRANSFER=1 \ - MODEL_ID=bigscience/bloom-560m \ - NUM_SHARD=1 \ PORT=80 WORKDIR /usr/src From 73d84c6ee5648def2a6cd77b810edeca9933c1ab Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 15 May 2023 10:35:20 +0200 Subject: [PATCH 32/39] Hotfixes for santacoder/bigcode. (#294) # What does this PR do? 
Hotfixes: - Uses `model_type`=`gpt_bigcode` for more general usage. - Hotfixes linked lm_head vs wte_embedding (safetensors file do not contain the key, correctly when the file is sharded, where as pytorch copies the tensor) Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --------- Co-authored-by: Ubuntu Co-authored-by: OlivierDehaene --- server/text_generation_server/models/__init__.py | 13 ++++++++++++- .../models/flash_santacoder.py | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index e02be3de..ec990fde 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -99,7 +99,7 @@ def get_model( else: return Galactica(model_id, revision, quantize=quantize) - if "bigcode" in model_id: + if model_id.startswith("bigcode/"): if sharded: if not FLASH_ATTENTION: raise NotImplementedError( @@ -113,6 +113,17 @@ def get_model( config = AutoConfig.from_pretrained(model_id, revision=revision) model_type = config.model_type + if model_type == "gpt_bigcode": + if sharded: + if not FLASH_ATTENTION: + raise NotImplementedError( + FLASH_ATT_ERROR_MESSAGE.format(f"Sharded Santacoder") + ) + return FlashSantacoderSharded(model_id, revision, quantize=quantize) + else: + santacoder_cls = FlashSantacoder if FLASH_ATTENTION else SantaCoder + return santacoder_cls(model_id, revision, quantize=quantize) + if model_type == "bloom": if sharded: return BLOOMSharded(model_id, revision, quantize=quantize) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index c463ee98..afe4eba5 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -376,6 +376,9 @@ class FlashSantacoderSharded(FlashSantacoder): else: module._buffers[param_name] = tensor + + model.lm_head.weight = torch.nn.Parameter(model.transformer.wte.weight) + uninitialized_parameters = [] for n, p in model.named_parameters(): if p.data.device == torch.device("meta"): From 91e674bb85760a19afb509cc0010d46b090183fd Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 15 May 2023 11:32:25 +0200 Subject: [PATCH 33/39] Lifting check_unitialized. (#325) # What does this PR do? Lifting check_unitialized. Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/text_generation_server/models/bloom.py | 9 --------- .../models/flash_llama.py | 18 ------------------ .../models/flash_neox.py | 9 --------- .../models/flash_santacoder.py | 11 ----------- .../text_generation_server/models/galactica.py | 9 --------- .../text_generation_server/models/gpt_neox.py | 9 --------- server/text_generation_server/models/model.py | 11 +++++++++++ server/text_generation_server/models/opt.py | 9 --------- server/text_generation_server/models/t5.py | 9 --------- 9 files changed, 11 insertions(+), 83 deletions(-) diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 877acb00..f6a69031 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -238,15 +238,6 @@ class BLOOMSharded(BLOOM): if name == "word_embeddings.weight": model.lm_head._parameters["weight"] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 5f47cf66..0b63f904 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -139,15 +139,6 @@ class FlashLlama(FlashCausalLM): del value - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - torch.cuda.empty_cache() model.post_load_weights(quantize) @@ -315,14 +306,5 @@ class FlashLlamaSharded(FlashLlama): else: module._buffers[param_name] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - torch.cuda.empty_cache() model.post_load_weights(quantize) diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index b3e1876f..168c9195 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -152,13 +152,4 @@ class 
FlashNeoXSharded(FlashNeoX): else: module._buffers[param_name] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - model.post_load_weights(quantize) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index afe4eba5..51a8998b 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -376,17 +376,6 @@ class FlashSantacoderSharded(FlashSantacoder): else: module._buffers[param_name] = tensor - model.lm_head.weight = torch.nn.Parameter(model.transformer.wte.weight) - - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - torch.cuda.empty_cache() model.post_load_weights(quantize) diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index 4f94b348..c6dd4c33 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -365,15 +365,6 @@ class GalacticaSharded(Galactica): if name == "model.decoder.embed_tokens.weight": model.lm_head._parameters["weight"] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 2d42e0b0..215bb2b6 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -215,15 +215,6 @@ class GPTNeoxSharded(CausalLM): else: module._buffers[param_name] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index 4c85c952..03f14013 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -32,6 +32,7 @@ class Model(ABC): self.decode_buffer = decode_buffer self.rank = rank self.world_size = world_size + self.check_initialized() @property def info(self) -> InfoResponse: @@ -99,3 +100,13 @@ class Model(ABC): return token_text, None, None else: return "", offset, token_offset + + def check_initialized(self): + uninitialized_parameters = [] + for n, p in self.model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model 
{self.__class__.__name__}: {uninitialized_parameters}" + ) diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 44f15df3..8d856b10 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -212,15 +212,6 @@ class OPTSharded(OPT): if name == "model.decoder.embed_tokens.weight": model.lm_head._parameters["weight"] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - def forward( self, input_ids, attention_mask, position_ids, past_key_values: Optional = None ): diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 381617b7..b5e7710d 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -222,15 +222,6 @@ class T5Sharded(Seq2SeqLM): else: module._buffers[param_name] = tensor - uninitialized_parameters = [] - for n, p in model.named_parameters(): - if p.data.device == torch.device("meta"): - uninitialized_parameters.append(n) - if uninitialized_parameters: - raise RuntimeError( - f"found uninitialized parameters in model: {uninitialized_parameters}" - ) - def forward( self, input_ids, From d7a97aa0b60fcf2a861ccd2ffb40e34cdf2c7650 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 15 May 2023 15:14:17 +0200 Subject: [PATCH 34/39] Removing dead variables. (#327) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
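For reviewers: the dead variables removed below are constructor attributes that were assigned but never read afterwards, namely `self.past_pad` in the flash models and `self.master = rank == 0` in the sharded models. A minimal, self-contained sketch of the cleanup; the class names here are illustrative and do not exist in the repository, and the real constructors also set up devices, tokenizers and weights:

```python
class ShardedModelBefore:
    def __init__(self, rank: int, world_size: int):
        self.past_pad = None       # assigned but never read afterwards
        self.master = rank == 0    # assigned but never read afterwards
        self.rank = rank
        self.world_size = world_size


class ShardedModelAfter:
    def __init__(self, rank: int, world_size: int):
        # Only the attributes that later methods actually consume are kept.
        self.rank = rank
        self.world_size = world_size
```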
--- server/text_generation_server/models/bloom.py | 1 - server/text_generation_server/models/flash_llama.py | 3 --- server/text_generation_server/models/flash_neox.py | 2 -- server/text_generation_server/models/flash_santacoder.py | 3 --- server/text_generation_server/models/galactica.py | 1 - server/text_generation_server/models/gpt_neox.py | 1 - server/text_generation_server/models/opt.py | 1 - server/text_generation_server/models/t5.py | 1 - 8 files changed, 13 deletions(-) diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index f6a69031..ed959291 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -72,7 +72,6 @@ class BLOOMSharded(BLOOM): quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 0b63f904..aa0b4483 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -29,7 +29,6 @@ tracer = trace.get_tracer(__name__) class FlashLlama(FlashCausalLM): def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): - self.past_pad = None if torch.cuda.is_available(): device = torch.device("cuda") dtype = torch.float16 @@ -150,9 +149,7 @@ class FlashLlamaSharded(FlashLlama): revision: Optional[str] = None, quantize: Optional[str] = None, ): - self.past_pad = None self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index 168c9195..fc741f55 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -33,9 +33,7 @@ class FlashNeoXSharded(FlashNeoX): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.past_pad = None self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 51a8998b..f810bb0b 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -28,7 +28,6 @@ tracer = trace.get_tracer(__name__) class FlashSantacoder(FlashCausalLM): def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): - self.past_pad = None if torch.cuda.is_available(): device = torch.device("cuda") dtype = torch.float16 @@ -173,9 +172,7 @@ class FlashSantacoderSharded(FlashSantacoder): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - self.past_pad = None self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index 
c6dd4c33..a0111250 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -199,7 +199,6 @@ class GalacticaSharded(Galactica): quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 215bb2b6..3e8557b2 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -38,7 +38,6 @@ class GPTNeoxSharded(CausalLM): quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 8d856b10..c83c3351 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -51,7 +51,6 @@ class OPTSharded(OPT): self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index b5e7710d..6fe77ca2 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -38,7 +38,6 @@ class T5Sharded(Seq2SeqLM): quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() - self.master = rank == 0 if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 From 66b277321d738ea6f0703b485ecefb61fb3f5441 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Mon, 15 May 2023 15:53:08 +0200 Subject: [PATCH 35/39] feat(ci): custom gpu runners (#328) --- .github/workflows/build.yaml | 100 +++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 289e4f67..0a99fb52 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -25,8 +25,44 @@ concurrency: cancel-in-progress: true jobs: + start-runner: + name: Start self-hosted EC2 runner + runs-on: ubuntu-latest + env: + AWS_REGION: us-east-1 + EC2_AMI_ID: ami-03cfed9ea28f4b002 + EC2_INSTANCE_TYPE: g5.12xlarge + EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc + EC2_SECURITY_GROUP: sg-04d472c808f365022 + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ env.AWS_REGION }} + - name: Start EC2 runner + id: start-ec2-runner + uses: philschmid/philschmid-ec2-github-runner@main + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + ec2-image-id: ${{ env.EC2_AMI_ID }} + ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }} + subnet-id: ${{ 
env.EC2_SUBNET_ID }} + security-group-id: ${{ env.EC2_SECURITY_GROUP }} + aws-resource-tags: > # optional, requires additional permissions + [ + {"Key": "Name", "Value": "ec2-tgi-github-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"} + ] + build-and-push-image: - runs-on: large-runner + needs: start-runner # required to start the main job when the runner is ready + runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner permissions: contents: write packages: write @@ -49,7 +85,7 @@ jobs: with: cosign-release: 'v1.13.1' - name: Tailscale - uses: tailscale/github-action@v1 + uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966 with: authkey: ${{ secrets.TAILSCALE_AUTHKEY }} - name: Login to GitHub Container Registry @@ -136,52 +172,26 @@ jobs: with: sarif_file: 'trivy-results.sarif' - build-and-push-sagemaker-image: + stop-runner: + name: Stop self-hosted EC2 runner needs: + - start-runner - build-and-push-image runs-on: ubuntu-latest + env: + AWS_REGION: us-east-1 + if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs steps: - - name: Checkout repository - uses: actions/checkout@v3 - - name: Initialize Docker Buildx - uses: docker/setup-buildx-action@v2.0.0 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 with: - install: true - - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v4.4.1 - - name: Tailscale - uses: tailscale/github-action@v1 + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ env.AWS_REGION }} + - name: Stop EC2 runner + uses: philschmid/philschmid-ec2-github-runner@main with: - authkey: ${{ secrets.TAILSCALE_AUTHKEY }} - - name: Login to internal Container Registry - uses: docker/login-action@v2.1.0 - with: - username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }} - password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }} - registry: registry.internal.huggingface.tech - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4.3.0 - with: - flavor: | - latest=auto - images: | - registry.internal.huggingface.tech/api-inference/community/text-generation-inference/sagemaker - tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} - type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} - - name: Build and push Docker image - uses: docker/build-push-action@v2 - with: - context: . - file: Dockerfile - push: ${{ github.event_name != 'pull_request' }} - platforms: 'linux/amd64' - build-args: | - GIT_SHA={{ env.GITHUB_SHA }} - target: sagemaker - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max \ No newline at end of file + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} \ No newline at end of file From f58f0a03647ce730f39404ecd50fb909db1bcfe0 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 15 May 2023 17:30:47 +0200 Subject: [PATCH 36/39] Single place for TP layers + Dropout Layer Norm + FastLinear (#329) # What does this PR do? 
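The `FastLinear`, tensor-parallel linear/embedding layers, `FastLayerNorm` and `PositionRotaryEmbedding` classes were previously copy-pasted into each of the flash Llama/NeoX/Santacoder modeling files. This PR moves them into a single shared module, `server/text_generation_server/utils/layers.py`, so each modeling file now keeps only an import such as the one below (taken from the NeoX hunk in this diff) instead of its own local definitions:

```python
# Shared layer implementations now live in one place; each custom modeling file
# imports what it needs rather than re-defining the classes locally.
from text_generation_server.utils.layers import (
    FastLinear,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    FastLayerNorm,
    PositionRotaryEmbedding,
)
```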
Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- .../custom_modeling/flash_llama_modeling.py | 225 +-------------- .../custom_modeling/flash_neox_modeling.py | 264 +---------------- .../flash_santacoder_modeling.py | 225 +-------------- server/text_generation_server/models/t5.py | 15 +- server/text_generation_server/utils/layers.py | 272 ++++++++++++++++++ 5 files changed, 303 insertions(+), 698 deletions(-) create mode 100644 server/text_generation_server/utils/layers.py diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 5b32b22e..11f3766e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -28,17 +28,16 @@ from transformers.activations import ACT2FN from typing import Optional # Flash attention imports -import rotary_emb import flash_attn_cuda -import dropout_layer_norm from flash_attn.layers.rotary import RotaryEmbedding - -HAS_BITS_AND_BYTES = True -try: - from bitsandbytes.nn import Linear8bitLt -except ImportError as e: - HAS_BITS_AND_BYTES = False +from text_generation_server.utils.layers import ( + FastLinear, + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + PositionRotaryEmbedding, +) class LlamaRMSNorm(nn.Module): @@ -91,216 +90,6 @@ class LlamaRMSNorm(nn.Module): return normed_hidden_states, res -class FastLinear(nn.Linear): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: - super(FastLinear, self).__init__(in_features, out_features, bias, device, dtype) - self.quantized = False - self.bnb_linear = None - - def prepare_weights(self, quantize: bool = False): - if quantize == "bitsandbytes": - if not HAS_BITS_AND_BYTES: - raise ImportError( - "bitsandbytes is not available on your machine either because it is not installed " - "or you don't have a GPU.\n" - "You can install it with `pip install bitsandbytes`." 
- ) - - self.quantized = True - self.bnb_linear = Linear8bitLt( - self.in_features, - self.out_features, - has_fp16_weights=False, - threshold=6.0, - bias=False, - ) - # Copy data to bnb_linear - self.bnb_linear.weight.data = self.weight.data - if self.bias is not None: - self.bnb_linear.bias = nn.Parameter(self.bias) - - # Delete reference to data - self.weight = None - self.bias = None - elif quantize == "gptq": - raise NotImplementedError("`gptq` is not implemented for now") - elif quantize is None: - self.weight = nn.Parameter(self.weight.T) - else: - raise ValueError(f"Unexpected quantize `{quantize}`") - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if self.quantized: - return self.bnb_linear(input) - else: - if self.bias is not None: - return torch.addmm(self.bias, input, self.weight) - return torch.matmul(input, self.weight) - - -class TensorParallelColumnLinear(FastLinear): - def __init__( - self, - in_features, - out_features, - process_group: torch.distributed.ProcessGroup, - bias=True, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_world_size = process_group.size() - assert out_features % self.tp_world_size == 0 - out_features = out_features // self.tp_world_size - - super().__init__( - in_features=in_features, - out_features=out_features, - bias=bias, - device=device, - dtype=dtype, - ) - - -class TensorParallelRowLinear(FastLinear): - def __init__( - self, - in_features, - out_features, - process_group: torch.distributed.ProcessGroup, - reduce=True, - bias=True, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_world_size = process_group.size() - self.reduce = reduce - assert in_features % self.tp_world_size == 0 - in_features = in_features // self.tp_world_size - - super().__init__( - in_features=in_features, - out_features=out_features, - bias=bias, - device=device, - dtype=dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - out = super(TensorParallelRowLinear, self).forward(input) - if self.reduce: - torch.distributed.all_reduce(out, group=self.process_group) - - return out - - -class TensorParallelEmbedding(nn.Embedding): - def __init__( - self, - num_embeddings, - embedding_dim, - process_group: torch.distributed.ProcessGroup, - padding_idx=None, - max_norm=None, - norm_type=2.0, - scale_grad_by_freq=False, - sparse=False, - _weight=None, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_rank = process_group.rank() - self.tp_world_size = process_group.size() - - self.original_num_embeddings = num_embeddings - - assert num_embeddings % self.tp_world_size == 0 - block_size = num_embeddings // self.tp_world_size - # inputs in `[min_id, max_id[` are handled by `self` to get embeddings - self.min_id = self.tp_rank * block_size - self.max_id = (self.tp_rank + 1) * block_size - - # Additional entry that will map to zero - # Used for masking - self.null_idx = block_size - - super().__init__( - block_size, - embedding_dim, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse, - _weight=_weight, - device=device, - dtype=dtype, - ) - - def add_null_idx(self): - """Additional 0 entry used for masking""" - self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1))) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # default all out of bounds values to `self.null_idx` that will then be mapped to 0 - # translate for [0, self.max_id - self.min_id[ - input = torch.where( - 
(self.min_id > input) | (input >= self.max_id), - self.null_idx, - input - self.min_id, - ) - out = super().forward(input) - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -class PositionRotaryEmbedding(RotaryEmbedding): - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): - """ - Return cos and sin for the asked position ids - """ - - self._update_cos_sin_cache(dtype, position_ids.device, max_s) - - cos = torch.index_select(self._cos_cached, 0, position_ids) - sin = torch.index_select(self._sin_cached, 0, position_ids) - return cos.unsqueeze(1), sin.unsqueeze(1) - - def forward(self, qkv: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): - rotary_dim = cos.shape[-1] - q1 = qkv[:, 0, :, :rotary_dim] - q2 = qkv[:, 0, :, rotary_dim : 2 * rotary_dim] - k1 = qkv[:, 1, :, :rotary_dim] - k2 = qkv[:, 1, :, rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - return qkv - - class FlashLlamaAttention(torch.nn.Module): def __init__( self, diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index e7b878c0..369e8d4f 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -30,265 +30,17 @@ from transformers.models.gpt_neox import GPTNeoXConfig from typing import Optional # Flash attention imports -import rotary_emb import flash_attn_cuda -import dropout_layer_norm from flash_attn.layers.rotary import RotaryEmbedding - -HAS_BITS_AND_BYTES = True -try: - from bitsandbytes.nn import Linear8bitLt -except ImportError as e: - HAS_BITS_AND_BYTES = False - - -class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states - - return super(FastLayerNorm, self).forward(hidden_states), residual - else: - ( - normed_hidden_states, - residual, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - self.bias, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - if residual is None: - residual = hidden_states - - return normed_hidden_states, residual - - -class FastLinear(nn.Linear): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: - super(FastLinear, self).__init__(in_features, out_features, bias, device, dtype) - self.quantized = False - self.bnb_linear = None - - def prepare_weights(self, quantize: Optional[str] = None): - if quantize == "bitsandbytes": - if not HAS_BITS_AND_BYTES: - raise ImportError( - "bitsandbytes is not available on your machine either because it is 
not installed " - "or you don't have a GPU.\n" - "You can install it with `pip install bitsandbytes`." - ) - - self.quantized = True - self.bnb_linear = Linear8bitLt( - self.in_features, - self.out_features, - has_fp16_weights=False, - threshold=6.0, - bias=False, - ) - # Copy data to bnb_linear - self.bnb_linear.weight.data = self.weight.data - if self.bias is not None: - self.bnb_linear.bias = nn.Parameter(self.bias) - - # Delete reference to data - self.weight = None - self.bias = None - elif quantize == "gptq": - raise NotImplementedError("`gptq` is not implemented for now") - elif quantize is None: - self.weight = nn.Parameter(self.weight.T) - else: - raise ValueError(f"Unexpected quantize `{quantize}`") - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if self.quantized: - return self.bnb_linear(input) - else: - if self.bias is not None: - return torch.addmm(self.bias, input, self.weight) - return torch.matmul(input, self.weight) - - -class TensorParallelColumnLinear(FastLinear): - def __init__( - self, - in_features, - out_features, - process_group: torch.distributed.ProcessGroup, - bias=True, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_world_size = process_group.size() - assert out_features % self.tp_world_size == 0 - out_features = out_features // self.tp_world_size - - super().__init__( - in_features=in_features, - out_features=out_features, - bias=bias, - device=device, - dtype=dtype, - ) - - -class TensorParallelRowLinear(FastLinear): - def __init__( - self, - in_features, - out_features, - process_group: torch.distributed.ProcessGroup, - reduce=True, - bias=True, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_world_size = process_group.size() - self.reduce = reduce - assert in_features % self.tp_world_size == 0 - in_features = in_features // self.tp_world_size - - super().__init__( - in_features=in_features, - out_features=out_features, - bias=bias, - device=device, - dtype=dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - out = super(TensorParallelRowLinear, self).forward(input) - if self.reduce: - torch.distributed.all_reduce(out, group=self.process_group) - - return out - - -class TensorParallelEmbedding(nn.Embedding): - def __init__( - self, - num_embeddings, - embedding_dim, - process_group: torch.distributed.ProcessGroup, - padding_idx=None, - max_norm=None, - norm_type=2.0, - scale_grad_by_freq=False, - sparse=False, - _weight=None, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_rank = process_group.rank() - self.tp_world_size = process_group.size() - - self.original_num_embeddings = num_embeddings - - assert num_embeddings % self.tp_world_size == 0 - block_size = num_embeddings // self.tp_world_size - # inputs in `[min_id, max_id[` are handled by `self` to get embeddings - self.min_id = self.tp_rank * block_size - self.max_id = (self.tp_rank + 1) * block_size - - # Additional entry that will map to zero - # Used for masking - self.null_idx = block_size - - super().__init__( - block_size, - embedding_dim, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse, - _weight=_weight, - device=device, - dtype=dtype, - ) - - def add_null_idx(self): - """Additional 0 entry used for masking""" - self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1))) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # default all out of bounds values to `self.null_idx` 
that will then be mapped to 0 - # translate for [0, self.max_id - self.min_id[ - input = torch.where( - (self.min_id > input) | (input >= self.max_id), - self.null_idx, - input - self.min_id, - ) - out = super().forward(input) - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -class PositionRotaryEmbedding(RotaryEmbedding): - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): - """ - Return cos and sin for the asked position ids - """ - - self._update_cos_sin_cache(dtype, position_ids.device, max_s) - - cos = torch.index_select(self._cos_cached, 0, position_ids) - sin = torch.index_select(self._sin_cached, 0, position_ids) - return cos.unsqueeze(1), sin.unsqueeze(1) - - def forward(self, qkv: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): - rotary_dim = cos.shape[-1] - q1 = qkv[:, 0, :, :rotary_dim] - q2 = qkv[:, 0, :, rotary_dim : 2 * rotary_dim] - k1 = qkv[:, 1, :, :rotary_dim] - k2 = qkv[:, 1, :, rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - return qkv +from text_generation_server.utils.layers import ( + FastLinear, + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + FastLayerNorm, + PositionRotaryEmbedding, +) class FlashNeoxAttention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 23c3ea28..9451b01a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -9,224 +9,13 @@ from typing import Optional # Flash attention imports import flash_attn_cuda -import dropout_layer_norm - -HAS_BITS_AND_BYTES = True -try: - from bitsandbytes.nn import Linear8bitLt -except ImportError as e: - HAS_BITS_AND_BYTES = False - - -class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states - - return super(FastLayerNorm, self).forward(hidden_states), residual - else: - ( - normed_hidden_states, - residual, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - self.bias, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - if residual is None: - residual = hidden_states - - return normed_hidden_states, residual - - -class FastLinear(nn.Linear): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: - super(FastLinear, self).__init__(in_features, out_features, bias, device, 
dtype) - self.quantized = False - self.bnb_linear = None - - def prepare_weights(self, quantize: Optional[str] = None): - if quantize == "bitsandbytes": - if not HAS_BITS_AND_BYTES: - raise ImportError( - "bitsandbytes is not available on your machine either because it is not installed " - "or you don't have a GPU.\n" - "You can install it with `pip install bitsandbytes`." - ) - - self.quantized = True - self.bnb_linear = Linear8bitLt( - self.in_features, - self.out_features, - has_fp16_weights=False, - threshold=6.0, - bias=False, - ) - # Copy data to bnb_linear - self.bnb_linear.weight.data = self.weight.data - if self.bias is not None: - self.bnb_linear.bias = nn.Parameter(self.bias) - - # Delete reference to data - self.weight = None - self.bias = None - elif quantize == "gptq": - raise NotImplementedError("`gptq` is not implemented for now") - elif quantize is None: - self.weight = nn.Parameter(self.weight.T) - else: - raise ValueError(f"Unexpected quantize `{quantize}`") - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if self.quantized: - return self.bnb_linear(input) - else: - if self.bias is not None: - return torch.addmm(self.bias, input, self.weight) - return torch.matmul(input, self.weight) - - -class TensorParallelColumnLinear(FastLinear): - def __init__( - self, - in_features, - out_features, - process_group: torch.distributed.ProcessGroup, - bias=True, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_world_size = process_group.size() - assert out_features % self.tp_world_size == 0 - out_features = out_features // self.tp_world_size - - super().__init__( - in_features=in_features, - out_features=out_features, - bias=bias, - device=device, - dtype=dtype, - ) - - -class TensorParallelRowLinear(FastLinear): - def __init__( - self, - in_features, - out_features, - process_group: torch.distributed.ProcessGroup, - reduce=True, - bias=True, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_world_size = process_group.size() - self.reduce = reduce - assert in_features % self.tp_world_size == 0 - in_features = in_features // self.tp_world_size - - super().__init__( - in_features=in_features, - out_features=out_features, - bias=bias, - device=device, - dtype=dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - out = super(TensorParallelRowLinear, self).forward(input) - if self.reduce: - torch.distributed.all_reduce(out, group=self.process_group) - - return out - - -class TensorParallelEmbedding(nn.Embedding): - def __init__( - self, - num_embeddings, - embedding_dim, - process_group: torch.distributed.ProcessGroup, - reduce=True, - padding_idx=None, - max_norm=None, - norm_type=2.0, - scale_grad_by_freq=False, - sparse=False, - _weight=None, - device=None, - dtype=None, - ): - self.process_group = process_group - self.tp_rank = process_group.rank() - self.tp_world_size = process_group.size() - self.reduce = reduce - - self.original_num_embeddings = num_embeddings - - assert num_embeddings % self.tp_world_size == 0 - block_size = num_embeddings // self.tp_world_size - # inputs in `[min_id, max_id[` are handled by `self` to get embeddings - self.min_id = self.tp_rank * block_size - self.max_id = (self.tp_rank + 1) * block_size - - # Additional entry that will map to zero - # Used for masking - self.null_idx = block_size - - super().__init__( - block_size, - embedding_dim, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - 
sparse=sparse, - _weight=_weight, - device=device, - dtype=dtype, - ) - - def add_null_idx(self): - """Additional 0 entry used for masking""" - self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1))) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # default all out of bounds values to `self.null_idx` that will then be mapped to 0 - # translate for [0, self.max_id - self.min_id[ - input = torch.where( - (self.min_id > input) | (input >= self.max_id), - self.null_idx, - input - self.min_id, - ) - out = super().forward(input) - if self.reduce: - torch.distributed.all_reduce(out, group=self.process_group) - return out +from text_generation_server.utils.layers import ( + FastLinear, + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + FastLayerNorm, +) class FlashMQAttention(torch.nn.Module): diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 6fe77ca2..c8521dbf 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -10,23 +10,26 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoConfig, ) -from transformers.models.t5.parallel_layers import ( - TensorParallelColumnLinear, - TensorParallelEmbedding, - TensorParallelRowLinear, -) from text_generation_server.models import Seq2SeqLM from text_generation_server.utils import ( initialize_torch_distributed, weight_files, ) +from text_generation_server.utils.layers import ( + FastLinear, +) +from transformers.models.t5.parallel_layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, +) HAS_BITS_AND_BYTES = True try: import bitsandbytes as bnb from bitsandbytes.nn import Int8Params -except Exception as e: +except ImportError as e: HAS_BITS_AND_BYTES = False diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py new file mode 100644 index 00000000..3383bf4b --- /dev/null +++ b/server/text_generation_server/utils/layers.py @@ -0,0 +1,272 @@ +import torch + +from torch import nn + +HAS_BITS_AND_BYTES = True +try: + from bitsandbytes.nn import Linear8bitLt +except ImportError as e: + HAS_BITS_AND_BYTES = False + + +class FastLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + super(FastLinear, self).__init__(in_features, out_features, bias, device, dtype) + self.quantized = False + self.bnb_linear = None + + def prepare_weights(self, quantize: bool = False): + if quantize == "bitsandbytes": + if not HAS_BITS_AND_BYTES: + raise ImportError( + "bitsandbytes is not available on your machine either because it is not installed " + "or you don't have a GPU.\n" + "You can install it with `pip install bitsandbytes`." 
+ ) + + self.quantized = True + self.bnb_linear = Linear8bitLt( + self.in_features, + self.out_features, + has_fp16_weights=False, + threshold=6.0, + bias=False, + ) + # Copy data to bnb_linear + self.bnb_linear.weight.data = self.weight.data + if self.bias is not None: + self.bnb_linear.bias = nn.Parameter(self.bias) + + # Delete reference to data + self.weight = None + self.bias = None + elif quantize == "gptq": + raise NotImplementedError("`gptq` is not implemented for now") + elif quantize is None: + self.weight = nn.Parameter(self.weight.T) + else: + raise ValueError(f"Unexpected quantize `{quantize}`") + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.quantized: + return self.bnb_linear(input) + else: + if self.bias is not None: + return torch.addmm(self.bias, input, self.weight) + return torch.matmul(input, self.weight) + + +class TensorParallelColumnLinear(FastLinear): + def __init__( + self, + in_features, + out_features, + process_group: torch.distributed.ProcessGroup, + bias=True, + device=None, + dtype=None, + ): + self.process_group = process_group + self.tp_world_size = process_group.size() + assert out_features % self.tp_world_size == 0 + out_features = out_features // self.tp_world_size + + super().__init__( + in_features=in_features, + out_features=out_features, + bias=bias, + device=device, + dtype=dtype, + ) + + +class TensorParallelRowLinear(FastLinear): + def __init__( + self, + in_features, + out_features, + process_group: torch.distributed.ProcessGroup, + reduce=True, + bias=True, + device=None, + dtype=None, + ): + self.process_group = process_group + self.tp_world_size = process_group.size() + self.reduce = reduce + assert in_features % self.tp_world_size == 0 + in_features = in_features // self.tp_world_size + + super().__init__( + in_features=in_features, + out_features=out_features, + bias=bias, + device=device, + dtype=dtype, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + out = super(TensorParallelRowLinear, self).forward(input) + if self.reduce: + torch.distributed.all_reduce(out, group=self.process_group) + + return out + + +class TensorParallelEmbedding(nn.Embedding): + def __init__( + self, + num_embeddings, + embedding_dim, + process_group: torch.distributed.ProcessGroup, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + _weight=None, + device=None, + dtype=None, + ): + self.process_group = process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + + self.original_num_embeddings = num_embeddings + + assert num_embeddings % self.tp_world_size == 0 + block_size = num_embeddings // self.tp_world_size + # inputs in `[min_id, max_id[` are handled by `self` to get embeddings + self.min_id = self.tp_rank * block_size + self.max_id = (self.tp_rank + 1) * block_size + + # Additional entry that will map to zero + # Used for masking + self.null_idx = block_size + + super().__init__( + block_size, + embedding_dim, + padding_idx=padding_idx, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse, + _weight=_weight, + device=device, + dtype=dtype, + ) + + def add_null_idx(self): + """Additional 0 entry used for masking""" + self.weight = nn.Parameter(F.pad(self.weight, (0, 0, 0, 1))) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # default all out of bounds values to `self.null_idx` that will then be mapped to 0 + # translate for [0, self.max_id - self.min_id[ + input = torch.where( + 
(self.min_id > input) | (input >= self.max_id), + self.null_idx, + input - self.min_id, + ) + out = super().forward(input) + torch.distributed.all_reduce(out, group=self.process_group) + return out + + +try: + import dropout_layer_norm + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super(FastLayerNorm, self).forward(hidden_states), residual + else: + ( + normed_hidden_states, + residual, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + self.bias, + None, + None, + None, + None, + 0.0, + self.eps, + 1.0, + 0, + None, + False, + False, + ) + if residual is None: + residual = hidden_states + + return normed_hidden_states, residual + +except ImportError: + pass + + +try: + from flash_attn.layers.rotary import RotaryEmbedding + import rotary_emb + + class PositionRotaryEmbedding(RotaryEmbedding): + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + def get_cos_sin( + self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype + ): + """ + Return cos and sin for the asked position ids + """ + + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + + cos = torch.index_select(self._cos_cached, 0, position_ids) + sin = torch.index_select(self._sin_cached, 0, position_ids) + return cos.unsqueeze(1), sin.unsqueeze(1) + + def forward(self, qkv: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): + rotary_dim = cos.shape[-1] + q1 = qkv[:, 0, :, :rotary_dim] + q2 = qkv[:, 0, :, rotary_dim : 2 * rotary_dim] + k1 = qkv[:, 1, :, :rotary_dim] + k2 = qkv[:, 1, :, rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + return qkv + +except ImportError: + pass From e71471bec95823ef69daaeb03c4657b9b5211a02 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Mon, 15 May 2023 23:36:30 +0200 Subject: [PATCH 37/39] feat: add snapshot testing (#282) --- .github/workflows/build.yaml | 62 +- Makefile | 11 +- README.md | 2 + integration-tests/conftest.py | 146 ++++ .../models/__snapshots__/test_bloom_560m.ambr | 627 ++++++++++++++++ .../test_bloom_560m_sharded.ambr | 542 ++++++++++++++ .../__snapshots__/test_flash_llama.ambr | 465 ++++++++++++ .../models/__snapshots__/test_flash_neox.ambr | 682 ++++++++++++++++++ .../__snapshots__/test_flash_santacoder.ambr | 472 ++++++++++++ .../__snapshots__/test_flash_starcoder.ambr | 573 +++++++++++++++ .../models/__snapshots__/test_mt0_base.ambr | 306 ++++++++ integration-tests/models/test_bloom_560m.py | 63 ++ .../models/test_bloom_560m_sharded.py | 42 ++ integration-tests/models/test_flash_llama.py | 56 ++ integration-tests/models/test_flash_neox.py | 38 + .../models/test_flash_santacoder.py | 32 + .../models/test_flash_starcoder.py | 47 ++ 
integration-tests/models/test_mt0_base.py | 63 ++ integration-tests/models/utils.py | 15 + integration-tests/requirements.txt | 5 + launcher/tests/bloom_560m.json | 142 ---- launcher/tests/integration_tests.rs | 172 ----- launcher/tests/mt0_base.json | 137 ---- server/text_generation_server/models/bloom.py | 2 +- .../custom_modeling/flash_llama_modeling.py | 18 +- .../custom_modeling/flash_neox_modeling.py | 17 +- .../flash_santacoder_modeling.py | 16 +- .../models/flash_llama.py | 15 +- .../models/flash_neox.py | 14 +- .../models/flash_santacoder.py | 18 +- .../models/galactica.py | 2 +- .../text_generation_server/models/gpt_neox.py | 2 +- server/text_generation_server/models/opt.py | 7 +- server/text_generation_server/models/t5.py | 2 +- server/text_generation_server/utils/layers.py | 9 +- 35 files changed, 4313 insertions(+), 509 deletions(-) create mode 100644 integration-tests/conftest.py create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m.ambr create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_llama.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_neox.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_santacoder.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder.ambr create mode 100644 integration-tests/models/__snapshots__/test_mt0_base.ambr create mode 100644 integration-tests/models/test_bloom_560m.py create mode 100644 integration-tests/models/test_bloom_560m_sharded.py create mode 100644 integration-tests/models/test_flash_llama.py create mode 100644 integration-tests/models/test_flash_neox.py create mode 100644 integration-tests/models/test_flash_santacoder.py create mode 100644 integration-tests/models/test_flash_starcoder.py create mode 100644 integration-tests/models/test_mt0_base.py create mode 100644 integration-tests/models/utils.py create mode 100644 integration-tests/requirements.txt delete mode 100644 launcher/tests/bloom_560m.json delete mode 100644 launcher/tests/integration_tests.rs delete mode 100644 launcher/tests/mt0_base.json diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0a99fb52..c2aba160 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -20,10 +20,6 @@ on: branches: - 'main' -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - jobs: start-runner: name: Start self-hosted EC2 runner @@ -61,6 +57,9 @@ jobs: ] build-and-push-image: + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true needs: start-runner # required to start the main job when the runner is ready runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner permissions: @@ -108,7 +107,19 @@ jobs: username: ${{ secrets.AZURE_DOCKER_USERNAME }} password: ${{ secrets.AZURE_DOCKER_PASSWORD }} registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io + # If pull request - name: Extract metadata (tags, labels) for Docker + if: ${{ github.event_name == 'pull_request' }} + id: meta-pr + uses: docker/metadata-action@v4.3.0 + with: + images: | + registry.internal.huggingface.tech/api-inference/community/text-generation-inference + tags: | + type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} + # If main, release or tag + - name: Extract metadata (tags, labels) for Docker + if: ${{ 
github.event_name != 'pull_request' }} id: meta uses: docker/metadata-action@v4.3.0 with: @@ -129,13 +140,13 @@ jobs: with: context: . file: Dockerfile - push: ${{ github.event_name != 'pull_request' }} + push: true platforms: 'linux/amd64' build-args: | GIT_SHA=${{ env.GITHUB_SHA }} DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} + labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max # Sign the resulting Docker image digest except on PRs. @@ -172,11 +183,48 @@ jobs: with: sarif_file: 'trivy-results.sarif' + integration-tests: + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + needs: + - start-runner + - build-and-push-image # Wait for the docker image to be built + runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner + env: + DOCKER_VOLUME: /cache + steps: + - uses: actions/checkout@v2 + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4.4.1 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Tailscale + uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966 + with: + authkey: ${{ secrets.TAILSCALE_AUTHKEY }} + - name: Prepare disks + run: | + sudo mkfs -t ext4 /dev/nvme1n1 + sudo mkdir ${{ env.DOCKER_VOLUME }} + sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }} + - name: Install + run: | + make install-integration-tests + - name: Run tests + run: | + export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }} + export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} + pytest -s -vv integration-tests + stop-runner: name: Stop self-hosted EC2 runner needs: - start-runner - build-and-push-image + - integration-tests runs-on: ubuntu-latest env: AWS_REGION: us-east-1 diff --git a/Makefile b/Makefile index 032a49de..29c318fa 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ install-server: cd server && make install +install-integration-tests: + cd integration-tests && pip install -r requirements.txt + install-router: cd router && cargo install --path . 
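For orientation, here is a minimal sketch of how one of the new model test modules added in this patch (e.g. `integration-tests/models/test_flash_llama.py`) might consume the `launcher` and `snapshot_test` fixtures defined in the `integration-tests/conftest.py` hunk further below. The model id, shard count, and assertions are illustrative assumptions, not the exact contents of the committed tests.

```python
# Illustrative sketch only: assumes the `launcher` and `snapshot_test`
# fixtures behave as defined in integration-tests/conftest.py in this patch.
import pytest


@pytest.fixture(scope="module")
def flash_llama(launcher):
    # `launcher` starts text-generation-launcher (a local process, or a
    # Docker container when DOCKER_IMAGE is set) and yields an AsyncClient
    # bound to it; the model id and shard count here are placeholders.
    with launcher("huggingface/llama-7b", num_shard=2) as client:
        yield client


@pytest.mark.asyncio
async def test_flash_llama(flash_llama, snapshot_test):
    response = await flash_llama.generate("Test request", max_new_tokens=10)

    assert response.details.generated_tokens == 10
    # The stored .ambr snapshots serialize plain dicts, so the response is
    # compared as a dict; `snapshot_test` excludes the non-deterministic
    # `logprob` fields.
    assert snapshot_test(response.dict())
```

With `DOCKER_IMAGE` set, the `launcher` fixture returns the `docker_launcher` variant and the test exercises the freshly built image; when it is unset, the fixture falls back to spawning a local `text-generation-launcher` process, matching the two code paths in the conftest below.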
@@ -18,9 +21,15 @@ server-dev: router-dev: cd router && cargo run -- --port 8080 -integration-tests: install-router install-launcher +rust-tests: install-router install-launcher cargo test +integration-tests: install-integration-tests + pytest -s -vv -m "not private" integration-tests + +update-integration-tests: install-integration-tests + pytest -s -vv --snapshot-update integration-tests + python-server-tests: HF_HUB_ENABLE_HF_TRANSFER=1 pytest server/tests diff --git a/README.md b/README.md index 756d7e35..918ea5e2 100644 --- a/README.md +++ b/README.md @@ -253,5 +253,7 @@ make python-client-tests # or both server and client tests make python-tests # rust cargo tests +make rust-tests +# integration tests make integration-tests ``` diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py new file mode 100644 index 00000000..e9c51c37 --- /dev/null +++ b/integration-tests/conftest.py @@ -0,0 +1,146 @@ +import subprocess +import contextlib +import pytest +import asyncio +import os +import docker + +from docker.errors import NotFound +from typing import Optional, List +from syrupy.filters import props + +from text_generation import AsyncClient +from text_generation.types import Response + +DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None) +HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) +DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") + + +@pytest.fixture +def snapshot_test(snapshot): + return lambda value: value == snapshot(exclude=props("logprob")) + + +@pytest.fixture(scope="module") +def event_loop(): + loop = asyncio.get_event_loop() + yield loop + loop.close() + + +@pytest.fixture(scope="module") +def launcher(event_loop): + @contextlib.contextmanager + def local_launcher( + model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None + ): + port = 9999 + master_port = 19999 + + shard_uds_path = f"/tmp/{model_id.replace('/', '--')}-server" + + args = [ + "text-generation-launcher", + "--model-id", + model_id, + "--port", + str(port), + "--master-port", + str(master_port), + "--shard-uds-path", + shard_uds_path, + ] + + if num_shard is not None: + args.extend(["--num-shard", str(num_shard)]) + if quantize: + args.append("--quantize") + + with subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) as process: + yield AsyncClient(f"http://localhost:{port}") + + process.terminate() + process.wait(60) + + launcher_output = process.stdout.read().decode("utf-8") + print(launcher_output) + + process.stdout.close() + process.stderr.close() + + @contextlib.contextmanager + def docker_launcher( + model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None + ): + port = 9999 + + args = ["--model-id", model_id, "--env"] + + if num_shard is not None: + args.extend(["--num-shard", str(num_shard)]) + if quantize: + args.append("--quantize") + + client = docker.from_env() + + container_name = f"tgi-tests-{model_id.split('/')[-1]}-{num_shard}-{quantize}" + + try: + container = client.containers.get(container_name) + container.stop() + container.wait() + except NotFound: + pass + + gpu_count = num_shard if num_shard is not None else 1 + + env = {} + if HUGGING_FACE_HUB_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN + + volumes = [] + if DOCKER_VOLUME: + volumes = [f"{DOCKER_VOLUME}:/data"] + + container = client.containers.run( + DOCKER_IMAGE, + command=args, + name=container_name, + environment=env, + auto_remove=True, + detach=True, + device_requests=[ + 
docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]]) + ], + volumes=volumes, + ports={"80/tcp": port}, + ) + + yield AsyncClient(f"http://localhost:{port}") + + container.stop() + + container_output = container.logs().decode("utf-8") + print(container_output) + + if DOCKER_IMAGE is not None: + return docker_launcher + return local_launcher + + +@pytest.fixture(scope="module") +def generate_load(): + async def generate_load_inner( + client: AsyncClient, prompt: str, max_new_tokens: int, n: int + ) -> List[Response]: + futures = [ + client.generate(prompt, max_new_tokens=max_new_tokens) for _ in range(n) + ] + + results = await asyncio.gather(*futures) + return [r.dict() for r in results] + + return generate_load_inner diff --git a/integration-tests/models/__snapshots__/test_bloom_560m.ambr b/integration-tests/models/__snapshots__/test_bloom_560m.ambr new file mode 100644 index 00000000..1067513d --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m.ambr @@ -0,0 +1,627 @@ +# serializer version: 1 +# name: test_bloom_560m + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': 0, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 159570, + 'special': False, + 'text': ' réch', + }), + dict({ + 'id': 810, + 'special': False, + 'text': 'au', + }), + dict({ + 'id': 12736, + 'special': False, + 'text': 'ffer', + }), + dict({ + 'id': 1742, + 'special': False, + 'text': ' au', + }), + dict({ + 'id': 6105, + 'special': False, + 'text': ' bain', + }), + dict({ + 'id': 88254, + 'special': False, + 'text': '-mar', + }), + dict({ + 'id': 641, + 'special': False, + 'text': 'ie', + }), + dict({ + 'id': 2940, + 'special': False, + 'text': ' avec', + }), + ]), + }), + 'generated_text': ' le faire réchauffer au bain-marie avec', + }) +# --- +# name: test_bloom_560m_all_params + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': 0, + 'tokens': list([ + dict({ + 'id': 408, + 'special': False, + 'text': ' que', + }), + dict({ + 'id': 20288, + 'special': False, + 'text': " l'on", + }), + dict({ + 'id': 22255, + 'special': False, + 'text': ' trouve', + }), + dict({ + 'id': 1622, + 'special': False, + 'text': ' une', + }), + dict({ + 'id': 187079, + 'special': False, + 'text': ' posture', + }), + dict({ + 'id': 501, + 'special': False, + 'text': ' par', + }), + dict({ + 'id': 8741, + 'special': False, + 'text': ' rapport', + }), + dict({ + 'id': 693, + 'special': False, + 'text': ' à', + }), + dict({ + 'id': 366, + 'special': False, + 'text': ' 
la', + }), + dict({ + 'id': 36503, + 'special': False, + 'text': ' pratique', + }), + ]), + }), + 'generated_text': "Pour déguster un ortolan, il faut tout d'abord que l'on trouve une posture par rapport à la pratique", + }) +# --- +# name: test_bloom_560m_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 
35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + ]) +# --- diff --git a/integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr b/integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr new file mode 100644 index 00000000..667a0373 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr @@ -0,0 +1,542 @@ +# serializer version: 1 +# name: test_bloom_560m_sharded + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 
'seed': 0, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 159570, + 'special': False, + 'text': ' réch', + }), + dict({ + 'id': 810, + 'special': False, + 'text': 'au', + }), + dict({ + 'id': 12736, + 'special': False, + 'text': 'ffer', + }), + dict({ + 'id': 1742, + 'special': False, + 'text': ' au', + }), + dict({ + 'id': 6105, + 'special': False, + 'text': ' bain', + }), + dict({ + 'id': 88254, + 'special': False, + 'text': '-mar', + }), + dict({ + 'id': 641, + 'special': False, + 'text': 'ie', + }), + dict({ + 'id': 2940, + 'special': False, + 'text': ' avec', + }), + ]), + }), + 'generated_text': ' le faire réchauffer au bain-marie avec', + }) +# --- +# name: test_bloom_560m_sharded_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': 
False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 17934, + 'text': 'Pour', + }), + dict({ + 'id': 49833, + 'text': ' dég', + }), + dict({ + 'id': 21543, + 'text': 'uster', + }), + dict({ + 'id': 447, + 'text': ' un', + }), + dict({ + 'id': 46341, + 'text': ' ort', + }), + dict({ + 'id': 35567, + 'text': 'olan', + }), + dict({ + 'id': 15, + 'text': ',', + }), + dict({ + 'id': 1669, + 'text': ' il', + }), + dict({ + 'id': 11580, + 'text': ' faut', + }), + dict({ + 'id': 3913, + 'text': ' tout', + }), + dict({ + 'id': 39261, + 'text': " d'abord", + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 578, + 'special': False, + 'text': ' le', + }), + dict({ + 'id': 5608, + 'special': False, + 'text': ' faire', + }), + dict({ + 'id': 1767, + 'special': False, + 'text': ' cu', + }), + dict({ + 'id': 1273, + 'special': False, + 'text': 'ire', + }), + dict({ + 'id': 1486, + 'special': False, + 'text': ' dans', + }), + dict({ + 'id': 283, + 'special': False, + 'text': ' de', + }), + dict({ + 'id': 40410, + 'special': False, + 'text': " l'eau", + }), + dict({ + 'id': 20226, + 'special': False, + 'text': ' bou', + }), + dict({ + 'id': 172483, + 'special': False, + 'text': 'illante', + }), + dict({ + 'id': 2805, + 'special': False, + 'text': ' sal', + }), + ]), + }), + 'generated_text': " le faire cuire dans de l'eau bouillante sal", + }), + ]) +# --- diff --git a/integration-tests/models/__snapshots__/test_flash_llama.ambr b/integration-tests/models/__snapshots__/test_flash_llama.ambr new file mode 100644 index 00000000..f4e3a4c1 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama.ambr @@ -0,0 +1,465 @@ +# serializer version: 1 +# name: test_flash_llama + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': 
list([ + dict({ + 'id': 1, + 'text': '', + }), + dict({ + 'id': 4321, + 'text': 'Test', + }), + dict({ + 'id': 2009, + 'text': 'request', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 363, + 'special': False, + 'text': ' for', + }), + dict({ + 'id': 847, + 'special': False, + 'text': ' /', + }), + dict({ + 'id': 2754, + 'special': False, + 'text': 'api', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29894, + 'special': False, + 'text': 'v', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 16418, + 'special': False, + 'text': 'projects', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + ]), + }), + 'generated_text': 'for /api/v1/projects/1', + }) +# --- +# name: test_flash_llama_all_params + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 1, + 'text': '', + }), + dict({ + 'id': 4321, + 'text': 'Test', + }), + dict({ + 'id': 2009, + 'text': 'request', + }), + ]), + 'seed': 0, + 'tokens': list([ + dict({ + 'id': 5229, + 'special': False, + 'text': ' failed', + }), + dict({ + 'id': 363, + 'special': False, + 'text': ' for', + }), + dict({ + 'id': 5641, + 'special': False, + 'text': ' IP', + }), + dict({ + 'id': 16428, + 'special': False, + 'text': ' Address', + }), + dict({ + 'id': 29901, + 'special': False, + 'text': ':', + }), + dict({ + 'id': 525, + 'special': False, + 'text': " '", + }), + dict({ + 'id': 8516, + 'special': False, + 'text': 'None', + }), + dict({ + 'id': 4286, + 'special': False, + 'text': "'.", + }), + dict({ + 'id': 13, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 294, + 'special': False, + 'text': 'as', + }), + ]), + }), + 'generated_text': ''' + Test requestfailed for IP Address: 'None'. 
+ as + ''', + }) +# --- +# name: test_flash_llama_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 1, + 'text': '', + }), + dict({ + 'id': 4321, + 'text': 'Test', + }), + dict({ + 'id': 2009, + 'text': 'request', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 363, + 'special': False, + 'text': ' for', + }), + dict({ + 'id': 847, + 'special': False, + 'text': ' /', + }), + dict({ + 'id': 2754, + 'special': False, + 'text': 'api', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29894, + 'special': False, + 'text': 'v', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 16418, + 'special': False, + 'text': 'projects', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + ]), + }), + 'generated_text': 'for /api/v1/projects/1', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 1, + 'text': '', + }), + dict({ + 'id': 4321, + 'text': 'Test', + }), + dict({ + 'id': 2009, + 'text': 'request', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 363, + 'special': False, + 'text': ' for', + }), + dict({ + 'id': 847, + 'special': False, + 'text': ' /', + }), + dict({ + 'id': 2754, + 'special': False, + 'text': 'api', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29894, + 'special': False, + 'text': 'v', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 16418, + 'special': False, + 'text': 'projects', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + ]), + }), + 'generated_text': 'for /api/v1/projects/1', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 1, + 'text': '', + }), + dict({ + 'id': 4321, + 'text': 'Test', + }), + dict({ + 'id': 2009, + 'text': 'request', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 363, + 'special': False, + 'text': ' for', + }), + dict({ + 'id': 847, + 'special': False, + 'text': ' /', + }), + dict({ + 'id': 2754, + 'special': False, + 'text': 'api', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29894, + 'special': False, + 'text': 'v', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 16418, + 'special': False, + 'text': 'projects', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + ]), + }), + 'generated_text': 'for /api/v1/projects/1', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 1, + 'text': '', + }), + dict({ + 'id': 4321, + 'text': 'Test', + }), + dict({ + 'id': 2009, + 'text': 'request', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 363, + 'special': False, + 'text': ' for', + }), + dict({ + 'id': 847, + 'special': False, + 
'text': ' /', + }), + dict({ + 'id': 2754, + 'special': False, + 'text': 'api', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29894, + 'special': False, + 'text': 'v', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 16418, + 'special': False, + 'text': 'projects', + }), + dict({ + 'id': 29914, + 'special': False, + 'text': '/', + }), + dict({ + 'id': 29896, + 'special': False, + 'text': '1', + }), + ]), + }), + 'generated_text': 'for /api/v1/projects/1', + }), + ]) +# --- diff --git a/integration-tests/models/__snapshots__/test_flash_neox.ambr b/integration-tests/models/__snapshots__/test_flash_neox.ambr new file mode 100644 index 00000000..4330db6b --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_neox.ambr @@ -0,0 +1,682 @@ +# serializer version: 1 +# name: test_flash_neox + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 50278, + 'text': '<|prompter|>', + }), + dict({ + 'id': 1276, + 'text': 'What', + }), + dict({ + 'id': 310, + 'text': ' is', + }), + dict({ + 'id': 247, + 'text': ' a', + }), + dict({ + 'id': 1167, + 'text': ' mem', + }), + dict({ + 'id': 70, + 'text': 'e', + }), + dict({ + 'id': 13, + 'text': ',', + }), + dict({ + 'id': 285, + 'text': ' and', + }), + dict({ + 'id': 752, + 'text': ' what', + }), + dict({ + 'id': 434, + 'text': "'s", + }), + dict({ + 'id': 253, + 'text': ' the', + }), + dict({ + 'id': 2892, + 'text': ' history', + }), + dict({ + 'id': 3212, + 'text': ' behind', + }), + dict({ + 'id': 436, + 'text': ' this', + }), + dict({ + 'id': 3159, + 'text': ' word', + }), + dict({ + 'id': 32, + 'text': '?', + }), + dict({ + 'id': 0, + 'text': '<|endoftext|>', + }), + dict({ + 'id': 50281, + 'text': '<|assistant|>', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 510, + 'special': False, + 'text': 'The', + }), + dict({ + 'id': 3159, + 'special': False, + 'text': ' word', + }), + dict({ + 'id': 346, + 'special': False, + 'text': ' "', + }), + dict({ + 'id': 6441, + 'special': False, + 'text': 'mem', + }), + dict({ + 'id': 70, + 'special': False, + 'text': 'e', + }), + dict({ + 'id': 3, + 'special': False, + 'text': '"', + }), + dict({ + 'id': 369, + 'special': False, + 'text': ' was', + }), + dict({ + 'id': 806, + 'special': False, + 'text': ' first', + }), + dict({ + 'id': 908, + 'special': False, + 'text': ' used', + }), + dict({ + 'id': 275, + 'special': False, + 'text': ' in', + }), + ]), + }), + 'generated_text': 'The word "meme" was first used in', + }) +# --- +# name: test_flash_neox_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 50278, + 'text': '<|prompter|>', + }), + dict({ + 'id': 1276, + 'text': 'What', + }), + dict({ + 'id': 310, + 'text': ' is', + }), + dict({ + 'id': 247, + 'text': ' a', + }), + dict({ + 'id': 1167, + 'text': ' mem', + }), + dict({ + 'id': 70, + 'text': 'e', + }), + dict({ + 'id': 13, + 'text': ',', + }), + dict({ + 'id': 285, + 'text': ' and', + }), + dict({ + 'id': 752, + 'text': ' what', + }), + dict({ + 'id': 434, + 'text': "'s", + }), + dict({ + 'id': 253, + 'text': ' the', + }), + dict({ + 'id': 2892, + 'text': ' history', + }), + dict({ + 'id': 3212, + 'text': ' behind', + }), + dict({ + 'id': 436, + 'text': ' this', + }), + dict({ + 'id': 3159, + 
'text': ' word', + }), + dict({ + 'id': 32, + 'text': '?', + }), + dict({ + 'id': 0, + 'text': '<|endoftext|>', + }), + dict({ + 'id': 50281, + 'text': '<|assistant|>', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 510, + 'special': False, + 'text': 'The', + }), + dict({ + 'id': 3159, + 'special': False, + 'text': ' word', + }), + dict({ + 'id': 346, + 'special': False, + 'text': ' "', + }), + dict({ + 'id': 6441, + 'special': False, + 'text': 'mem', + }), + dict({ + 'id': 70, + 'special': False, + 'text': 'e', + }), + dict({ + 'id': 3, + 'special': False, + 'text': '"', + }), + dict({ + 'id': 369, + 'special': False, + 'text': ' was', + }), + dict({ + 'id': 806, + 'special': False, + 'text': ' first', + }), + dict({ + 'id': 908, + 'special': False, + 'text': ' used', + }), + dict({ + 'id': 275, + 'special': False, + 'text': ' in', + }), + ]), + }), + 'generated_text': 'The word "meme" was first used in', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 50278, + 'text': '<|prompter|>', + }), + dict({ + 'id': 1276, + 'text': 'What', + }), + dict({ + 'id': 310, + 'text': ' is', + }), + dict({ + 'id': 247, + 'text': ' a', + }), + dict({ + 'id': 1167, + 'text': ' mem', + }), + dict({ + 'id': 70, + 'text': 'e', + }), + dict({ + 'id': 13, + 'text': ',', + }), + dict({ + 'id': 285, + 'text': ' and', + }), + dict({ + 'id': 752, + 'text': ' what', + }), + dict({ + 'id': 434, + 'text': "'s", + }), + dict({ + 'id': 253, + 'text': ' the', + }), + dict({ + 'id': 2892, + 'text': ' history', + }), + dict({ + 'id': 3212, + 'text': ' behind', + }), + dict({ + 'id': 436, + 'text': ' this', + }), + dict({ + 'id': 3159, + 'text': ' word', + }), + dict({ + 'id': 32, + 'text': '?', + }), + dict({ + 'id': 0, + 'text': '<|endoftext|>', + }), + dict({ + 'id': 50281, + 'text': '<|assistant|>', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 510, + 'special': False, + 'text': 'The', + }), + dict({ + 'id': 3159, + 'special': False, + 'text': ' word', + }), + dict({ + 'id': 346, + 'special': False, + 'text': ' "', + }), + dict({ + 'id': 6441, + 'special': False, + 'text': 'mem', + }), + dict({ + 'id': 70, + 'special': False, + 'text': 'e', + }), + dict({ + 'id': 3, + 'special': False, + 'text': '"', + }), + dict({ + 'id': 369, + 'special': False, + 'text': ' was', + }), + dict({ + 'id': 806, + 'special': False, + 'text': ' first', + }), + dict({ + 'id': 908, + 'special': False, + 'text': ' used', + }), + dict({ + 'id': 275, + 'special': False, + 'text': ' in', + }), + ]), + }), + 'generated_text': 'The word "meme" was first used in', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 50278, + 'text': '<|prompter|>', + }), + dict({ + 'id': 1276, + 'text': 'What', + }), + dict({ + 'id': 310, + 'text': ' is', + }), + dict({ + 'id': 247, + 'text': ' a', + }), + dict({ + 'id': 1167, + 'text': ' mem', + }), + dict({ + 'id': 70, + 'text': 'e', + }), + dict({ + 'id': 13, + 'text': ',', + }), + dict({ + 'id': 285, + 'text': ' and', + }), + dict({ + 'id': 752, + 'text': ' what', + }), + dict({ + 'id': 434, + 'text': "'s", + }), + dict({ + 'id': 253, + 'text': ' the', + }), + dict({ + 'id': 2892, + 'text': ' history', + }), + dict({ + 'id': 3212, + 'text': ' behind', + }), + dict({ + 'id': 436, + 'text': ' this', + }), + dict({ + 'id': 3159, + 'text': ' word', + }), + dict({ + 'id': 32, + 'text': '?', + 
}), + dict({ + 'id': 0, + 'text': '<|endoftext|>', + }), + dict({ + 'id': 50281, + 'text': '<|assistant|>', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 510, + 'special': False, + 'text': 'The', + }), + dict({ + 'id': 3159, + 'special': False, + 'text': ' word', + }), + dict({ + 'id': 346, + 'special': False, + 'text': ' "', + }), + dict({ + 'id': 6441, + 'special': False, + 'text': 'mem', + }), + dict({ + 'id': 70, + 'special': False, + 'text': 'e', + }), + dict({ + 'id': 3, + 'special': False, + 'text': '"', + }), + dict({ + 'id': 369, + 'special': False, + 'text': ' was', + }), + dict({ + 'id': 806, + 'special': False, + 'text': ' first', + }), + dict({ + 'id': 908, + 'special': False, + 'text': ' used', + }), + dict({ + 'id': 275, + 'special': False, + 'text': ' in', + }), + ]), + }), + 'generated_text': 'The word "meme" was first used in', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 50278, + 'text': '<|prompter|>', + }), + dict({ + 'id': 1276, + 'text': 'What', + }), + dict({ + 'id': 310, + 'text': ' is', + }), + dict({ + 'id': 247, + 'text': ' a', + }), + dict({ + 'id': 1167, + 'text': ' mem', + }), + dict({ + 'id': 70, + 'text': 'e', + }), + dict({ + 'id': 13, + 'text': ',', + }), + dict({ + 'id': 285, + 'text': ' and', + }), + dict({ + 'id': 752, + 'text': ' what', + }), + dict({ + 'id': 434, + 'text': "'s", + }), + dict({ + 'id': 253, + 'text': ' the', + }), + dict({ + 'id': 2892, + 'text': ' history', + }), + dict({ + 'id': 3212, + 'text': ' behind', + }), + dict({ + 'id': 436, + 'text': ' this', + }), + dict({ + 'id': 3159, + 'text': ' word', + }), + dict({ + 'id': 32, + 'text': '?', + }), + dict({ + 'id': 0, + 'text': '<|endoftext|>', + }), + dict({ + 'id': 50281, + 'text': '<|assistant|>', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 510, + 'special': False, + 'text': 'The', + }), + dict({ + 'id': 3159, + 'special': False, + 'text': ' word', + }), + dict({ + 'id': 346, + 'special': False, + 'text': ' "', + }), + dict({ + 'id': 6441, + 'special': False, + 'text': 'mem', + }), + dict({ + 'id': 70, + 'special': False, + 'text': 'e', + }), + dict({ + 'id': 3, + 'special': False, + 'text': '"', + }), + dict({ + 'id': 369, + 'special': False, + 'text': ' was', + }), + dict({ + 'id': 806, + 'special': False, + 'text': ' first', + }), + dict({ + 'id': 908, + 'special': False, + 'text': ' used', + }), + dict({ + 'id': 275, + 'special': False, + 'text': ' in', + }), + ]), + }), + 'generated_text': 'The word "meme" was first used in', + }), + ]) +# --- diff --git a/integration-tests/models/__snapshots__/test_flash_santacoder.ambr b/integration-tests/models/__snapshots__/test_flash_santacoder.ambr new file mode 100644 index 00000000..030820cb --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_santacoder.ambr @@ -0,0 +1,472 @@ +# serializer version: 1 +# name: test_flash_santacoder + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 563, + 'text': 'def', + }), + dict({ + 'id': 942, + 'text': ' print', + }), + dict({ + 'id': 62, + 'text': '_', + }), + dict({ + 'id': 7196, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 1241, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 258, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 942, + 'special': False, + 'text': ' print', + }), + 
dict({ + 'id': 372, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 7371, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 9956, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 8657, + 'special': False, + 'text': '!")', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1018, + 'special': False, + 'text': 'print', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World!") + + print + ''', + }) +# --- +# name: test_flash_santacoder_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 563, + 'text': 'def', + }), + dict({ + 'id': 942, + 'text': ' print', + }), + dict({ + 'id': 62, + 'text': '_', + }), + dict({ + 'id': 7196, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 1241, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 258, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 942, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 372, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 7371, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 9956, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 8657, + 'special': False, + 'text': '!")', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1018, + 'special': False, + 'text': 'print', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World!") + + print + ''', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 563, + 'text': 'def', + }), + dict({ + 'id': 942, + 'text': ' print', + }), + dict({ + 'id': 62, + 'text': '_', + }), + dict({ + 'id': 7196, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 1241, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 258, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 942, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 372, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 7371, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 9956, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 8657, + 'special': False, + 'text': '!")', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1018, + 'special': False, + 'text': 'print', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World!") + + print + ''', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 563, + 'text': 'def', + }), + dict({ + 'id': 942, + 'text': ' print', + }), + dict({ + 'id': 62, + 'text': '_', + }), + dict({ + 'id': 7196, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 1241, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 258, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 942, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 372, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 7371, + 'special': False, + 'text': 'Hello', + 
}), + dict({ + 'id': 9956, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 8657, + 'special': False, + 'text': '!")', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1018, + 'special': False, + 'text': 'print', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World!") + + print + ''', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 563, + 'text': 'def', + }), + dict({ + 'id': 942, + 'text': ' print', + }), + dict({ + 'id': 62, + 'text': '_', + }), + dict({ + 'id': 7196, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 1241, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 258, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 942, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 372, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 7371, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 9956, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 8657, + 'special': False, + 'text': '!")', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 185, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1018, + 'special': False, + 'text': 'print', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World!") + + print + ''', + }), + ]) +# --- diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder.ambr b/integration-tests/models/__snapshots__/test_flash_starcoder.ambr new file mode 100644 index 00000000..e0f4b568 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder.ambr @@ -0,0 +1,573 @@ +# serializer version: 1 +# name: test_flash_starcoder + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 589, + 'text': 'def', + }), + dict({ + 'id': 1459, + 'text': ' print', + }), + dict({ + 'id': 81, + 'text': '_', + }), + dict({ + 'id': 7656, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 2262, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 284, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1459, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 440, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 8279, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 10896, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 657, + 'special': False, + 'text': '")', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 589, + 'special': False, + 'text': 'def', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World") + + def + ''', + }) +# --- +# name: test_flash_starcoder_default_params + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 12, + 'prefill': list([ + dict({ + 'id': 589, + 'text': 'def', + }), + dict({ + 'id': 1459, + 'text': ' print', + }), + dict({ + 'id': 81, + 'text': '_', + }), + dict({ + 'id': 7656, + 'text': 'hello', + }), + ]), + 'seed': 0, + 'tokens': list([ + dict({ + 'id': 2262, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 284, + 'special': 
False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 5741, + 'special': False, + 'text': ' logging', + }), + dict({ + 'id': 32, + 'special': False, + 'text': '.', + }), + dict({ + 'id': 1338, + 'special': False, + 'text': 'info', + }), + dict({ + 'id': 463, + 'special': False, + 'text': "('", + }), + dict({ + 'id': 8279, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 30, + 'special': False, + 'text': ',', + }), + dict({ + 'id': 10896, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 683, + 'special': False, + 'text': "')", + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 0, + 'special': True, + 'text': '<|endoftext|>', + }), + ]), + }), + 'generated_text': ''' + (): + logging.info('Hello, World') + <|endoftext|> + ''', + }) +# --- +# name: test_flash_starcoder_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 589, + 'text': 'def', + }), + dict({ + 'id': 1459, + 'text': ' print', + }), + dict({ + 'id': 81, + 'text': '_', + }), + dict({ + 'id': 7656, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 2262, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 284, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1459, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 440, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 8279, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 10896, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 657, + 'special': False, + 'text': '")', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 589, + 'special': False, + 'text': 'def', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World") + + def + ''', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 589, + 'text': 'def', + }), + dict({ + 'id': 1459, + 'text': ' print', + }), + dict({ + 'id': 81, + 'text': '_', + }), + dict({ + 'id': 7656, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 2262, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 284, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1459, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 440, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 8279, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 10896, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 657, + 'special': False, + 'text': '")', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 589, + 'special': False, + 'text': 'def', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World") + + def + ''', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 589, + 'text': 'def', + }), + dict({ + 'id': 1459, + 'text': ' print', + }), + dict({ + 'id': 81, + 'text': '_', + }), + dict({ + 'id': 7656, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 2262, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 284, + 
'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1459, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 440, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 8279, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 10896, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 657, + 'special': False, + 'text': '")', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 589, + 'special': False, + 'text': 'def', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World") + + def + ''', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 589, + 'text': 'def', + }), + dict({ + 'id': 1459, + 'text': ' print', + }), + dict({ + 'id': 81, + 'text': '_', + }), + dict({ + 'id': 7656, + 'text': 'hello', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 2262, + 'special': False, + 'text': '():', + }), + dict({ + 'id': 284, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 1459, + 'special': False, + 'text': ' print', + }), + dict({ + 'id': 440, + 'special': False, + 'text': '("', + }), + dict({ + 'id': 8279, + 'special': False, + 'text': 'Hello', + }), + dict({ + 'id': 10896, + 'special': False, + 'text': ' World', + }), + dict({ + 'id': 657, + 'special': False, + 'text': '")', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 203, + 'special': False, + 'text': ''' + + + ''', + }), + dict({ + 'id': 589, + 'special': False, + 'text': 'def', + }), + ]), + }), + 'generated_text': ''' + (): + print("Hello World") + + def + ''', + }), + ]) +# --- diff --git a/integration-tests/models/__snapshots__/test_mt0_base.ambr b/integration-tests/models/__snapshots__/test_mt0_base.ambr new file mode 100644 index 00000000..d7c6eaf6 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_mt0_base.ambr @@ -0,0 +1,306 @@ +# serializer version: 1 +# name: test_mt0_base + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 5, + 'prefill': list([ + dict({ + 'id': 0, + 'text': '', + }), + ]), + 'seed': 0, + 'tokens': list([ + dict({ + 'id': 926, + 'special': False, + 'text': 'To', + }), + dict({ + 'id': 18295, + 'special': False, + 'text': ' sell', + }), + dict({ + 'id': 7868, + 'special': False, + 'text': ' things', + }), + dict({ + 'id': 260, + 'special': False, + 'text': '.', + }), + dict({ + 'id': 1, + 'special': True, + 'text': '', + }), + ]), + }), + 'generated_text': 'To sell things.', + }) +# --- +# name: test_mt0_base_all_params + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 10, + 'prefill': list([ + dict({ + 'id': 0, + 'text': '', + }), + ]), + 'seed': 0, + 'tokens': list([ + dict({ + 'id': 16017, + 'special': False, + 'text': 'blue', + }), + dict({ + 'id': 20495, + 'special': False, + 'text': ' sky', + }), + dict({ + 'id': 259, + 'special': False, + 'text': ' ', + }), + dict({ + 'id': 15484, + 'special': False, + 'text': 'appear', + }), + dict({ + 'id': 345, + 'special': False, + 'text': 'ed', + }), + dict({ + 'id': 288, + 'special': False, + 'text': ' to', + }), + dict({ + 'id': 35622, + 'special': False, + 'text': ' cloud', + }), + dict({ + 'id': 263, + 'special': False, + 'text': 's', + }), + dict({ + 'id': 14701, + 'special': False, + 'text': ' 
above', + }), + dict({ + 'id': 751, + 'special': False, + 'text': ' all', + }), + ]), + }), + 'generated_text': 'Why is the sky blue?blue sky appeared to clouds above all', + }) +# --- +# name: test_mt0_base_load + list([ + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 6, + 'prefill': list([ + dict({ + 'id': 0, + 'text': '', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 259, + 'special': False, + 'text': '', + }), + dict({ + 'id': 39261, + 'special': False, + 'text': 'Because', + }), + dict({ + 'id': 609, + 'special': False, + 'text': ' it', + }), + dict({ + 'id': 339, + 'special': False, + 'text': ' is', + }), + dict({ + 'id': 16017, + 'special': False, + 'text': ' blue', + }), + dict({ + 'id': 1, + 'special': True, + 'text': '', + }), + ]), + }), + 'generated_text': 'Because it is blue', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 6, + 'prefill': list([ + dict({ + 'id': 0, + 'text': '', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 259, + 'special': False, + 'text': '', + }), + dict({ + 'id': 39261, + 'special': False, + 'text': 'Because', + }), + dict({ + 'id': 609, + 'special': False, + 'text': ' it', + }), + dict({ + 'id': 339, + 'special': False, + 'text': ' is', + }), + dict({ + 'id': 16017, + 'special': False, + 'text': ' blue', + }), + dict({ + 'id': 1, + 'special': True, + 'text': '', + }), + ]), + }), + 'generated_text': 'Because it is blue', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 6, + 'prefill': list([ + dict({ + 'id': 0, + 'text': '', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 259, + 'special': False, + 'text': '', + }), + dict({ + 'id': 39261, + 'special': False, + 'text': 'Because', + }), + dict({ + 'id': 609, + 'special': False, + 'text': ' it', + }), + dict({ + 'id': 339, + 'special': False, + 'text': ' is', + }), + dict({ + 'id': 16017, + 'special': False, + 'text': ' blue', + }), + dict({ + 'id': 1, + 'special': True, + 'text': '', + }), + ]), + }), + 'generated_text': 'Because it is blue', + }), + dict({ + 'details': dict({ + 'best_of_sequences': None, + 'finish_reason': , + 'generated_tokens': 6, + 'prefill': list([ + dict({ + 'id': 0, + 'text': '', + }), + ]), + 'seed': None, + 'tokens': list([ + dict({ + 'id': 259, + 'special': False, + 'text': '', + }), + dict({ + 'id': 39261, + 'special': False, + 'text': 'Because', + }), + dict({ + 'id': 609, + 'special': False, + 'text': ' it', + }), + dict({ + 'id': 339, + 'special': False, + 'text': ' is', + }), + dict({ + 'id': 16017, + 'special': False, + 'text': ' blue', + }), + dict({ + 'id': 1, + 'special': True, + 'text': '', + }), + ]), + }), + 'generated_text': 'Because it is blue', + }), + ]) +# --- diff --git a/integration-tests/models/test_bloom_560m.py b/integration-tests/models/test_bloom_560m.py new file mode 100644 index 00000000..e13606f7 --- /dev/null +++ b/integration-tests/models/test_bloom_560m.py @@ -0,0 +1,63 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def bloom_560(launcher): + with launcher("bigscience/bloom-560m") as client: + yield client + + +@pytest.mark.asyncio +async def test_bloom_560m(bloom_560, snapshot_test): + await health_check(bloom_560, 60) + + response = await bloom_560.generate( + "Pour déguster un ortolan, il faut tout d'abord", + max_new_tokens=10, + top_p=0.9, + seed=0, + ) + + assert 
response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_bloom_560m_all_params(bloom_560, snapshot_test): + await health_check(bloom_560, 60) + + response = await bloom_560.generate( + "Pour déguster un ortolan, il faut tout d'abord", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + stop_sequences=["test"], + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_bloom_560m_load(bloom_560, generate_load, snapshot_test): + await health_check(bloom_560, 60) + + responses = await generate_load( + bloom_560, + "Pour déguster un ortolan, il faut tout d'abord", + max_new_tokens=10, + n=4, + ) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/test_bloom_560m_sharded.py b/integration-tests/models/test_bloom_560m_sharded.py new file mode 100644 index 00000000..bfb70253 --- /dev/null +++ b/integration-tests/models/test_bloom_560m_sharded.py @@ -0,0 +1,42 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def bloom_560m_sharded(launcher): + with launcher("bigscience/bloom-560m", num_shard=2) as client: + yield client + + +@pytest.mark.asyncio +async def test_bloom_560m_sharded(bloom_560m_sharded, snapshot_test): + await health_check(bloom_560m_sharded, 60) + + response = await bloom_560m_sharded.generate( + "Pour déguster un ortolan, il faut tout d'abord", + max_new_tokens=10, + top_p=0.9, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_bloom_560m_sharded_load( + bloom_560m_sharded, generate_load, snapshot_test +): + await health_check(bloom_560m_sharded, 60) + + responses = await generate_load( + bloom_560m_sharded, + "Pour déguster un ortolan, il faut tout d'abord", + max_new_tokens=10, + n=4, + ) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/test_flash_llama.py b/integration-tests/models/test_flash_llama.py new file mode 100644 index 00000000..4d1f2bcf --- /dev/null +++ b/integration-tests/models/test_flash_llama.py @@ -0,0 +1,56 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def flash_llama(launcher): + with launcher("huggingface/llama-7b", num_shard=2) as client: + yield client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama(flash_llama, snapshot_test): + await health_check(flash_llama, 120) + + response = await flash_llama.generate("Test request", max_new_tokens=10) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_all_params(flash_llama, snapshot_test): + await health_check(flash_llama, 120) + + response = await flash_llama.generate( + "Test request", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + stop_sequences=["test"], + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_load(flash_llama, generate_load, snapshot_test): + await health_check(flash_llama, 120) + + responses = await 
generate_load(flash_llama, "Test request", max_new_tokens=10, n=4) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/test_flash_neox.py b/integration-tests/models/test_flash_neox.py new file mode 100644 index 00000000..8c981028 --- /dev/null +++ b/integration-tests/models/test_flash_neox.py @@ -0,0 +1,38 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def flash_neox(launcher): + with launcher("OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2) as client: + yield client + + +@pytest.mark.asyncio +async def test_flash_neox(flash_neox, snapshot_test): + await health_check(flash_neox, 240) + + response = await flash_neox.generate( + "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>", + max_new_tokens=10, + ) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_flash_neox_load(flash_neox, generate_load, snapshot_test): + await health_check(flash_neox, 240) + + responses = await generate_load( + flash_neox, + "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>", + max_new_tokens=10, + n=4, + ) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/test_flash_santacoder.py b/integration-tests/models/test_flash_santacoder.py new file mode 100644 index 00000000..64a59d78 --- /dev/null +++ b/integration-tests/models/test_flash_santacoder.py @@ -0,0 +1,32 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def flash_santacoder(launcher): + with launcher("bigcode/santacoder") as client: + yield client + + +@pytest.mark.asyncio +async def test_flash_santacoder(flash_santacoder, snapshot_test): + await health_check(flash_santacoder, 60) + + response = await flash_santacoder.generate("def print_hello", max_new_tokens=10) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_flash_santacoder_load(flash_santacoder, generate_load, snapshot_test): + await health_check(flash_santacoder, 60) + + responses = await generate_load( + flash_santacoder, "def print_hello", max_new_tokens=10, n=4 + ) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/test_flash_starcoder.py b/integration-tests/models/test_flash_starcoder.py new file mode 100644 index 00000000..d43e92dc --- /dev/null +++ b/integration-tests/models/test_flash_starcoder.py @@ -0,0 +1,47 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def flash_starcoder(launcher): + with launcher("bigcode/starcoder", num_shard=2) as client: + yield client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_starcoder(flash_starcoder, snapshot_test): + await health_check(flash_starcoder, 240) + + response = await flash_starcoder.generate("def print_hello", max_new_tokens=10) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_starcoder_default_params(flash_starcoder, snapshot_test): + await health_check(flash_starcoder, 240) + + response = await flash_starcoder.generate( + "def print_hello", max_new_tokens=60, temperature=0.2, top_p=0.95, seed=0 + ) + + assert response.details.generated_tokens == 12 + assert snapshot_test(response) + + 
+@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_starcoder_load(flash_starcoder, generate_load, snapshot_test): + await health_check(flash_starcoder, 240) + + responses = await generate_load( + flash_starcoder, "def print_hello", max_new_tokens=10, n=4 + ) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/test_mt0_base.py b/integration-tests/models/test_mt0_base.py new file mode 100644 index 00000000..7310a30f --- /dev/null +++ b/integration-tests/models/test_mt0_base.py @@ -0,0 +1,63 @@ +import pytest + +from utils import health_check + + +@pytest.fixture(scope="module") +def mt0_base(launcher): + with launcher("bigscience/mt0-base") as client: + yield client + + +@pytest.mark.asyncio +async def test_mt0_base(mt0_base, snapshot_test): + await health_check(mt0_base, 60) + + response = await mt0_base.generate( + "Why is the sky blue?", + max_new_tokens=10, + top_p=0.9, + seed=0, + ) + + assert response.details.generated_tokens == 5 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_mt0_base_all_params(mt0_base, snapshot_test): + await health_check(mt0_base, 60) + + response = await mt0_base.generate( + "Why is the sky blue?", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + stop_sequences=["test"], + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert snapshot_test(response) + + +@pytest.mark.asyncio +async def test_mt0_base_load(mt0_base, generate_load, snapshot_test): + await health_check(mt0_base, 60) + + responses = await generate_load( + mt0_base, + "Why is the sky blue?", + max_new_tokens=10, + n=4, + ) + + assert len(responses) == 4 + + assert snapshot_test(responses) diff --git a/integration-tests/models/utils.py b/integration-tests/models/utils.py new file mode 100644 index 00000000..c47e4871 --- /dev/null +++ b/integration-tests/models/utils.py @@ -0,0 +1,15 @@ +import time + +from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError +from text_generation import AsyncClient + + +async def health_check(client: AsyncClient, timeout: int = 60): + assert timeout > 0 + for _ in range(timeout): + try: + await client.generate("test") + return + except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: + time.sleep(1) + raise RuntimeError("Health check failed") diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt new file mode 100644 index 00000000..9ecbb2ee --- /dev/null +++ b/integration-tests/requirements.txt @@ -0,0 +1,5 @@ +syrupy +text-generation==0.5.1 +pytest +pytest-asyncio==0.17.2 +docker \ No newline at end of file diff --git a/launcher/tests/bloom_560m.json b/launcher/tests/bloom_560m.json deleted file mode 100644 index 96f89f6b..00000000 --- a/launcher/tests/bloom_560m.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "generated_text": ".get(\"action\");\n if (action == null) {\n throw new RuntimeException", - "details": { - "finish_reason": "length", - "generated_tokens": 20, - "seed": null, - "prefill": [ - { - "id": 10264, - "text": "Test", - "logprob": null - }, - { - "id": 8821, - "text": " request", - "logprob": -11.894989 - } - ], - "tokens": [ - { - "id": 17, - "text": ".", - "logprob": -1.8267672, - "special": false - }, - { - "id": 1587, - "text": "get", - "logprob": -2.4674969, - "special": false - }, - { - "id": 11, - "text": "(", - "logprob": -1.906001, - "special": 
false - }, - { - "id": 5, - "text": "\"", - "logprob": -1.2279545, - "special": false - }, - { - "id": 4899, - "text": "action", - "logprob": -4.170299, - "special": false - }, - { - "id": 5, - "text": "\"", - "logprob": -0.32478866, - "special": false - }, - { - "id": 12, - "text": ")", - "logprob": -1.0773665, - "special": false - }, - { - "id": 30, - "text": ";", - "logprob": -0.27640742, - "special": false - }, - { - "id": 837, - "text": "\n ", - "logprob": -1.6970354, - "special": false - }, - { - "id": 1320, - "text": " if", - "logprob": -1.4495516, - "special": false - }, - { - "id": 375, - "text": " (", - "logprob": -0.23609057, - "special": false - }, - { - "id": 4899, - "text": "action", - "logprob": -1.1916996, - "special": false - }, - { - "id": 3535, - "text": " ==", - "logprob": -0.8918753, - "special": false - }, - { - "id": 5109, - "text": " null", - "logprob": -0.3933342, - "special": false - }, - { - "id": 12, - "text": ")", - "logprob": -0.43212673, - "special": false - }, - { - "id": 731, - "text": " {", - "logprob": -0.17702064, - "special": false - }, - { - "id": 1260, - "text": "\n ", - "logprob": -0.07027565, - "special": false - }, - { - "id": 10519, - "text": " throw", - "logprob": -1.3915029, - "special": false - }, - { - "id": 2084, - "text": " new", - "logprob": -0.04201372, - "special": false - }, - { - "id": 150858, - "text": " RuntimeException", - "logprob": -1.7329919, - "special": false - } - ] - } -} \ No newline at end of file diff --git a/launcher/tests/integration_tests.rs b/launcher/tests/integration_tests.rs deleted file mode 100644 index 0d2b6c74..00000000 --- a/launcher/tests/integration_tests.rs +++ /dev/null @@ -1,172 +0,0 @@ -use float_eq::assert_float_eq; -use serde::Deserialize; -use serde_json::Value; -use std::fs::File; -use std::io::{BufRead, BufReader}; -use std::path::PathBuf; -use std::thread; -use std::thread::sleep; -use std::time::Duration; -use subprocess::{Popen, PopenConfig, Redirection}; - -#[derive(Deserialize)] -pub struct Token { - id: u32, - text: String, - logprob: Option, - special: bool, -} - -#[derive(Deserialize)] -struct Details { - finish_reason: String, - generated_tokens: u32, - tokens: Vec, -} - -#[derive(Deserialize)] -struct GeneratedText { - generated_text: String, - details: Details, -} - -fn start_launcher(model_id: String, num_shard: usize, port: usize, master_port: usize) -> Popen { - let argv = vec![ - "text-generation-launcher".to_string(), - "--model-id".to_string(), - model_id.clone(), - "--num-shard".to_string(), - num_shard.to_string(), - "--port".to_string(), - port.to_string(), - "--master-port".to_string(), - master_port.to_string(), - "--shard-uds-path".to_string(), - format!("/tmp/test-{}-{}-{}", num_shard, port, master_port), - ]; - - let mut launcher = Popen::create( - &argv, - PopenConfig { - stdout: Redirection::Pipe, - stderr: Redirection::Merge, - ..Default::default() - }, - ) - .expect("Could not start launcher"); - - // Redirect STDOUT and STDERR to the console - // (STDERR is merged into STDOUT) - let launcher_stdout = launcher.stdout.take().unwrap(); - - thread::spawn(move || { - let stdout = BufReader::new(launcher_stdout); - for line in stdout.lines() { - println!("{}", line.unwrap()); - } - }); - - for _ in 0..60 { - let health = reqwest::blocking::get(format!("http://localhost:{}/health", port)); - if health.is_ok() { - return launcher; - } - sleep(Duration::from_secs(2)); - } - - launcher.terminate().unwrap(); - launcher.wait().unwrap(); - panic!("failed to launch {}", model_id) -} - 
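The deleted Rust harness above and the new Python suite share the same readiness protocol: start the launcher, then poll until a trivial request succeeds or a timeout expires (the Rust version queries `/health` every 2 seconds for up to 60 attempts, while `utils.health_check` retries `generate("test")` once per second). A minimal sketch of that polling loop, assuming a `text_generation.AsyncClient`; unlike the committed helper, which calls `time.sleep`, this sketch uses `asyncio.sleep` so the event loop is not blocked while waiting:

```python
# Readiness polling in the spirit of utils.health_check; the client object
# and the one-second retry interval are assumptions for illustration.
import asyncio

from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from text_generation import AsyncClient


async def wait_until_ready(client: AsyncClient, timeout: int = 60) -> None:
    for _ in range(timeout):
        try:
            # Any successful generation means the router and shards are up.
            await client.generate("test")
            return
        except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
            await asyncio.sleep(1)
    raise RuntimeError("Health check failed")
```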
-fn test_model( - model_id: String, - num_shard: usize, - port: usize, - master_port: usize, -) -> GeneratedText { - let mut launcher = start_launcher(model_id, num_shard, port, master_port); - - let data = r#" - { - "inputs": "Test request", - "parameters": { - "details": true - } - }"#; - let req: Value = serde_json::from_str(data).unwrap(); - - let client = reqwest::blocking::Client::new(); - let res = client - .post(format!("http://localhost:{}/generate", port)) - .json(&req) - .send(); - - launcher.terminate().unwrap(); - launcher.wait().unwrap(); - - let result: GeneratedText = res.unwrap().json().unwrap(); - result -} - -fn read_json(name: &str) -> GeneratedText { - let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - d.push("tests/"); - d.push(name); - - let file = File::open(d).unwrap(); - let reader = BufReader::new(file); - - let result: GeneratedText = serde_json::from_reader(reader).unwrap(); - result -} - -fn compare_results(result: GeneratedText, expected: GeneratedText) { - assert_eq!(result.generated_text, expected.generated_text); - assert_eq!(result.details.finish_reason, expected.details.finish_reason); - assert_eq!( - result.details.generated_tokens, - expected.details.generated_tokens - ); - - for (token, expected_token) in result - .details - .tokens - .into_iter() - .zip(expected.details.tokens.into_iter()) - { - assert_eq!(token.id, expected_token.id); - assert_eq!(token.text, expected_token.text); - assert_eq!(token.special, expected_token.special); - if let Some(logprob) = token.logprob { - let expected_logprob = expected_token.logprob.unwrap(); - assert_float_eq!(logprob, expected_logprob, abs <= 0.001); - } else { - assert_eq!(token.logprob, expected_token.logprob); - } - } -} - -#[test] -fn test_bloom_560m() { - let expected = read_json("bloom_560m.json"); - - let result = test_model("bigscience/bloom-560m".to_string(), 1, 3000, 29500); - compare_results(result, expected); -} - -#[test] -fn test_bloom_560m_distributed() { - let expected = read_json("bloom_560m.json"); - - let result = test_model("bigscience/bloom-560m".to_string(), 2, 3001, 29501); - compare_results(result, expected); -} - -#[test] -fn test_mt0_base() { - let expected = read_json("mt0_base.json"); - - let result = test_model("bigscience/mt0-base".to_string(), 1, 3002, 29502); - compare_results(result, expected); -} diff --git a/launcher/tests/mt0_base.json b/launcher/tests/mt0_base.json deleted file mode 100644 index f5be63f9..00000000 --- a/launcher/tests/mt0_base.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "generated_text": "\"\"\"Test the contents of the contents of the contents. 
\"\"\" test_test", - "details": { - "finish_reason": "length", - "generated_tokens": 20, - "seed": null, - "prefill": [ - { - "id": 0, - "text": "", - "logprob": null - } - ], - "tokens": [ - { - "id": 259, - "text": "", - "logprob": -1.3656927, - "special": false - }, - { - "id": 215100, - "text": "\"\"\"", - "logprob": -2.6551573, - "special": false - }, - { - "id": 46138, - "text": "Test", - "logprob": -1.8059857, - "special": false - }, - { - "id": 287, - "text": " the", - "logprob": -1.2102449, - "special": false - }, - { - "id": 259, - "text": " ", - "logprob": -1.6057279, - "special": false - }, - { - "id": 49076, - "text": "contents", - "logprob": -3.6060903, - "special": false - }, - { - "id": 304, - "text": " of", - "logprob": -0.5270343, - "special": false - }, - { - "id": 287, - "text": " the", - "logprob": -0.62522805, - "special": false - }, - { - "id": 259, - "text": " ", - "logprob": -1.4069618, - "special": false - }, - { - "id": 49076, - "text": "contents", - "logprob": -2.621994, - "special": false - }, - { - "id": 304, - "text": " of", - "logprob": -1.3172221, - "special": false - }, - { - "id": 287, - "text": " the", - "logprob": -0.3501925, - "special": false - }, - { - "id": 259, - "text": " ", - "logprob": -0.7219573, - "special": false - }, - { - "id": 49076, - "text": "contents", - "logprob": -1.0494149, - "special": false - }, - { - "id": 260, - "text": ".", - "logprob": -1.0803378, - "special": false - }, - { - "id": 259, - "text": " ", - "logprob": -0.32933083, - "special": false - }, - { - "id": 215100, - "text": "\"\"\"", - "logprob": -0.11268901, - "special": false - }, - { - "id": 2978, - "text": " test", - "logprob": -1.5846587, - "special": false - }, - { - "id": 290, - "text": "_", - "logprob": -0.49796978, - "special": false - }, - { - "id": 4125, - "text": "test", - "logprob": -2.0026445, - "special": false - } - ] - } -} \ No newline at end of file diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index ed959291..9029e954 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -129,7 +129,7 @@ class BLOOMSharded(BLOOM): parameters = dict(model.named_parameters()) for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): full_name = f"transformer.{name}" diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 11f3766e..54670b79 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -21,16 +21,14 @@ import torch import torch.distributed -from torch.nn import functional as F - from torch import nn from transformers.activations import ACT2FN from typing import Optional # Flash attention imports import flash_attn_cuda +import dropout_layer_norm -from flash_attn.layers.rotary import RotaryEmbedding from text_generation_server.utils.layers import ( FastLinear, TensorParallelRowLinear, @@ -332,15 +330,15 @@ class FlashLlamaModel(torch.nn.Module): self.head_size = self.layers[0].self_attn.head_size self.num_heads = self.layers[0].self_attn.num_heads - def post_load_weights(self, load_in_8bit: bool = False): + def post_load_weights(self, quantize: Optional[str] 
= None): if isinstance(self.embed_tokens, TensorParallelEmbedding): self.embed_tokens.add_null_idx() for layer in self.layers: layer: FlashLlamaLayer - layer.self_attn.query_key_value.prepare_weights(load_in_8bit) - layer.self_attn.o_proj.prepare_weights(load_in_8bit) - layer.mlp.gate_up_proj.prepare_weights(load_in_8bit) - layer.mlp.down_proj.prepare_weights(load_in_8bit) + layer.self_attn.query_key_value.prepare_weights(quantize) + layer.self_attn.o_proj.prepare_weights(quantize) + layer.mlp.gate_up_proj.prepare_weights(quantize) + layer.mlp.down_proj.prepare_weights(quantize) def forward( self, @@ -429,8 +427,8 @@ class FlashLlamaForCausalLM(torch.nn.Module): else: self.lm_head = FastLinear(config.hidden_size, config.vocab_size, bias=False) - def post_load_weights(self, load_in_8bit: bool = False): - self.model.post_load_weights(load_in_8bit) + def post_load_weights(self, quantize: Optional[str] = None): + self.model.post_load_weights(quantize) self.lm_head.prepare_weights() def forward( diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index 369e8d4f..2c6b8da6 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -21,8 +21,6 @@ import torch import torch.distributed -from torch.nn import functional as F - from torch import nn from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel @@ -32,7 +30,6 @@ from typing import Optional # Flash attention imports import flash_attn_cuda -from flash_attn.layers.rotary import RotaryEmbedding from text_generation_server.utils.layers import ( FastLinear, TensorParallelRowLinear, @@ -345,16 +342,16 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): self.head_size = self.layers[0].attention.head_size self.num_heads = self.layers[0].attention.num_heads - def post_load_weights(self, load_in_8bit=False): + def post_load_weights(self, quantize: Optional[str] = None): if isinstance(self.embed_in, TensorParallelEmbedding): self.embed_in.add_null_idx() for layer in self.layers: layer: FlashNeoXLayer layer.attention.shuffle_qkv_dims() - layer.attention.query_key_value.prepare_weights(load_in_8bit) - layer.attention.dense.prepare_weights(load_in_8bit) - layer.mlp.dense_h_to_4h.prepare_weights(load_in_8bit) - layer.mlp.dense_4h_to_h.prepare_weights(load_in_8bit) + layer.attention.query_key_value.prepare_weights(quantize) + layer.attention.dense.prepare_weights(quantize) + layer.mlp.dense_h_to_4h.prepare_weights(quantize) + layer.mlp.dense_4h_to_h.prepare_weights(quantize) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -457,8 +454,8 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): config.hidden_size, config.vocab_size, bias=False ) - def post_load_weights(self, load_in_8bit=False): - self.gpt_neox.post_load_weights(load_in_8bit) + def post_load_weights(self, quantize: Optional[str] = None): + self.gpt_neox.post_load_weights(quantize) self.embed_out.prepare_weights() @classmethod diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 9451b01a..9bded805 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ 
b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -1,8 +1,6 @@ import torch import torch.distributed -import torch.nn.functional as F - from torch import nn from transformers.activations import ACT2FN from typing import Optional @@ -261,16 +259,16 @@ class FlashSantacoderModel(nn.Module): self.head_size = self.h[0].attn.head_size self.num_heads = self.h[0].attn.num_heads - def post_load_weights(self, load_in_8bit: bool = False): + def post_load_weights(self, quantize: Optional[str] = None): if self.tp_embeddings: self.wte.add_null_idx() self.wpe.add_null_idx() for layer in self.h: layer: Block - layer.attn.c_attn.prepare_weights(load_in_8bit) - layer.attn.c_proj.prepare_weights(load_in_8bit) - layer.mlp.c_fc.prepare_weights(load_in_8bit) - layer.mlp.c_proj.prepare_weights(load_in_8bit) + layer.attn.c_attn.prepare_weights(quantize) + layer.attn.c_proj.prepare_weights(quantize) + layer.mlp.c_fc.prepare_weights(quantize) + layer.mlp.c_proj.prepare_weights(quantize) def forward( self, @@ -347,8 +345,8 @@ class FlashSantacoderForCausalLM(nn.Module): else: self.lm_head = FastLinear(config.hidden_size, config.vocab_size, bias=False) - def post_load_weights(self, load_in_8bit: bool = False): - self.transformer.post_load_weights(load_in_8bit) + def post_load_weights(self, quantize: Optional[str] = None): + self.transformer.post_load_weights(quantize) self.lm_head.prepare_weights() def forward( diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index aa0b4483..b775bd79 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -28,7 +28,12 @@ tracer = trace.get_tracer(__name__) class FlashLlama(FlashCausalLM): - def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + ): if torch.cuda.is_available(): device = torch.device("cuda") dtype = torch.float16 @@ -72,14 +77,14 @@ class FlashLlama(FlashCausalLM): def load_weights( model, filenames: List[Path], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, ): for filename in filenames: state_dict = torch.load(filename, map_location="cpu") for key, value in state_dict.items(): - value = value.to(device if not quantize else "cpu").to(dtype) + value = value.to(device if quantize is None else "cpu").to(dtype) layer_name = ".".join(key.split(".")[:4]) @@ -199,7 +204,7 @@ class FlashLlamaSharded(FlashLlama): def load_weights( model, filenames: List[str], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, rank: int, @@ -207,7 +212,7 @@ class FlashLlamaSharded(FlashLlama): ): for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): slice_ = f.get_slice(name) diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index fc741f55..0924f107 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -23,7 +23,12 @@ tracer = trace.get_tracer(__name__) class FlashNeoX(FlashCausalLM): - def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): + def __init__( + self, + model_id: str, + revision: 
Optional[str] = None, + quantize: Optional[str] = None, + ): super(FlashNeoX, self).__init__( FlashGPTNeoXForCausalLM, model_id, revision, quantize ) @@ -31,7 +36,10 @@ class FlashNeoX(FlashCausalLM): class FlashNeoXSharded(FlashNeoX): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() if torch.cuda.is_available(): @@ -89,7 +97,7 @@ class FlashNeoXSharded(FlashNeoX): parameters = dict(model.named_parameters()) for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): module_name, param_name = name.rsplit(".", 1) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index f810bb0b..031a67eb 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -27,7 +27,12 @@ tracer = trace.get_tracer(__name__) class FlashSantacoder(FlashCausalLM): - def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + ): if torch.cuda.is_available(): device = torch.device("cuda") dtype = torch.float16 @@ -84,7 +89,7 @@ class FlashSantacoder(FlashCausalLM): for filename in filenames: state_dict = torch.load(filename, map_location="cpu") for key, value in state_dict.items(): - value = value.to(device if not quantize else "cpu").to(dtype) + value = value.to(device if quantize is None else "cpu").to(dtype) layer_name = ".".join(key.split(".")[:4]) @@ -170,7 +175,10 @@ class FlashSantacoder(FlashCausalLM): class FlashSantacoderSharded(FlashSantacoder): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() if torch.cuda.is_available(): @@ -221,7 +229,7 @@ class FlashSantacoderSharded(FlashSantacoder): def load_weights( model, filenames: List[str], - quantize: bool, + quantize: Optional[str], device: torch.device, dtype: torch.dtype, rank: int, @@ -230,7 +238,7 @@ class FlashSantacoderSharded(FlashSantacoder): ): for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for key in f.keys(): slice_ = f.get_slice(key) diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index a0111250..d1e5e841 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -255,7 +255,7 @@ class GalacticaSharded(Galactica): parameters = dict(model.named_parameters()) for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): if name == "lm_head.weight": diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 3e8557b2..f95e5be2 100644 --- 
a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -94,7 +94,7 @@ class GPTNeoxSharded(CausalLM): parameters = dict(model.named_parameters()) for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): module_name, param_name = name.rsplit(".", 1) diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index c83c3351..093cf70a 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -48,7 +48,10 @@ class OPT(CausalLM): class OPTSharded(OPT): def __init__( - self, model_id: str, revision: Optional[str] = None, quantize: bool = False + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, ): self.process_group, rank, world_size = initialize_torch_distributed() if torch.cuda.is_available(): @@ -107,7 +110,7 @@ class OPTSharded(OPT): parameters = dict(model.named_parameters()) for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): if name == "lm_head.weight": diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index c8521dbf..8e3826a4 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -97,7 +97,7 @@ class T5Sharded(Seq2SeqLM): parameters = dict(model.named_parameters()) for file in filenames: with safe_open( - file, framework="pt", device=str(device) if not quantize else "cpu" + file, framework="pt", device=str(device) if quantize is None else "cpu" ) as f: for name in f.keys(): module_name, param_name = name.rsplit(".", 1) diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index 3383bf4b..7605639d 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -1,6 +1,8 @@ import torch from torch import nn +from torch.nn import functional as F +from typing import Optional HAS_BITS_AND_BYTES = True try: @@ -22,7 +24,7 @@ class FastLinear(nn.Linear): self.quantized = False self.bnb_linear = None - def prepare_weights(self, quantize: bool = False): + def prepare_weights(self, quantize: Optional[str] = None): if quantize == "bitsandbytes": if not HAS_BITS_AND_BYTES: raise ImportError( @@ -126,6 +128,7 @@ class TensorParallelEmbedding(nn.Embedding): num_embeddings, embedding_dim, process_group: torch.distributed.ProcessGroup, + reduce=True, padding_idx=None, max_norm=None, norm_type=2.0, @@ -135,6 +138,7 @@ class TensorParallelEmbedding(nn.Embedding): device=None, dtype=None, ): + self.reduce = reduce self.process_group = process_group self.tp_rank = process_group.rank() self.tp_world_size = process_group.size() @@ -177,7 +181,8 @@ class TensorParallelEmbedding(nn.Embedding): input - self.min_id, ) out = super().forward(input) - torch.distributed.all_reduce(out, group=self.process_group) + if self.reduce: + torch.distributed.all_reduce(out, group=self.process_group) return out From dbdc587dddf0f16b7c05b28fb632acf9f65f185f Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Tue, 16 May 2023 20:22:11 +0200 Subject: [PATCH 38/39] feat(integration-tests): improve comparison and health checks 
(#336) --- .github/workflows/build.yaml | 1 + clients/python/pyproject.toml | 2 +- clients/python/tests/test_client.py | 12 +- clients/python/text_generation/types.py | 2 +- integration-tests/conftest.py | 197 ++++- .../models/__snapshots__/test_bloom_560m.ambr | 627 ---------------- .../test_bloom_560m/test_bloom_560m.json | 128 ++++ .../test_bloom_560m_all_params.json | 98 +++ .../test_bloom_560m/test_bloom_560m_load.json | 514 +++++++++++++ .../test_bloom_560m_sharded.ambr | 542 -------------- .../test_bloom_560m_sharded.json | 128 ++++ .../test_bloom_560m_sharded_load.json | 514 +++++++++++++ .../__snapshots__/test_flash_llama.ambr | 465 ------------ .../test_flash_llama/test_flash_llama.json | 88 +++ .../test_flash_llama_all_params.json | 88 +++ .../test_flash_llama_load.json | 354 +++++++++ .../models/__snapshots__/test_flash_neox.ambr | 682 ------------------ .../test_flash_neox/test_flash_neox.json | 163 +++++ .../test_flash_neox/test_flash_neox_load.json | 654 +++++++++++++++++ .../__snapshots__/test_flash_santacoder.ambr | 472 ------------ .../test_flash_santacoder.json | 93 +++ .../test_flash_santacoder_load.json | 374 ++++++++++ .../__snapshots__/test_flash_starcoder.ambr | 573 --------------- .../test_flash_starcoder.json | 93 +++ .../test_flash_starcoder_default_params.json | 105 +++ .../test_flash_starcoder_load.json | 374 ++++++++++ .../models/__snapshots__/test_mt0_base.ambr | 306 -------- .../test_mt0_base/test_mt0_base.json | 48 ++ .../test_mt0_base_all_params.json | 78 ++ .../test_mt0_base/test_mt0_base_load.json | 218 ++++++ integration-tests/models/test_bloom_560m.py | 33 +- .../models/test_bloom_560m_sharded.py | 27 +- integration-tests/models/test_flash_llama.py | 33 +- integration-tests/models/test_flash_neox.py | 27 +- .../models/test_flash_santacoder.py | 29 +- .../models/test_flash_starcoder.py | 33 +- integration-tests/models/test_mt0_base.py | 33 +- integration-tests/models/utils.py | 15 - integration-tests/requirements.txt | 2 +- 39 files changed, 4416 insertions(+), 3809 deletions(-) delete mode 100644 integration-tests/models/__snapshots__/test_bloom_560m.ambr create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json delete mode 100644 integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json create mode 100644 integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json delete mode 100644 integration-tests/models/__snapshots__/test_flash_llama.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json create mode 100644 integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json delete mode 100644 integration-tests/models/__snapshots__/test_flash_neox.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox.json create mode 100644 integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox_load.json delete mode 100644 integration-tests/models/__snapshots__/test_flash_santacoder.ambr create mode 100644 
integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder.json create mode 100644 integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder_load.json delete mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder.ambr create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder.json create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_default_params.json create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_load.json delete mode 100644 integration-tests/models/__snapshots__/test_mt0_base.ambr create mode 100644 integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json create mode 100644 integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json delete mode 100644 integration-tests/models/utils.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c2aba160..79b3c777 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -10,6 +10,7 @@ on: pull_request: paths: - ".github/workflows/build.yaml" + - "integration-tests/**" - "server/**" - "proto/**" - "router/**" diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 00bb99fc..06d5f9cb 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation" -version = "0.5.1" +version = "0.5.2" description = "Hugging Face Text Generation Python Client" license = "Apache-2.0" authors = ["Olivier Dehaene "] diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py index 8972dfd1..32462f14 100644 --- a/clients/python/tests/test_client.py +++ b/clients/python/tests/test_client.py @@ -16,9 +16,9 @@ def test_generate(flan_t5_xxl_url, hf_headers): assert len(response.details.prefill) == 1 assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None) assert len(response.details.tokens) == 1 - assert response.details.tokens[0] == Token( - id=3, text="", logprob=-1.984375, special=False - ) + assert response.details.tokens[0].id == 3 + assert response.details.tokens[0].text == "" + assert not response.details.tokens[0].special def test_generate_best_of(flan_t5_xxl_url, hf_headers): @@ -82,9 +82,9 @@ async def test_generate_async(flan_t5_xxl_url, hf_headers): assert len(response.details.prefill) == 1 assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None) assert len(response.details.tokens) == 1 - assert response.details.tokens[0] == Token( - id=3, text="", logprob=-1.984375, special=False - ) + assert response.details.tokens[0].id == 3 + assert response.details.tokens[0].text == "" + assert not response.details.tokens[0].special @pytest.mark.asyncio diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index f3f9dcb5..ad3cd09b 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -154,7 +154,7 @@ class Token(BaseModel): # Generation finish reason -class FinishReason(Enum): +class FinishReason(str, Enum): # number of generated tokens == `max_new_tokens` Length = "length" # the model generated its end of sequence token diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index e9c51c37..ba1abca9 100644 --- 
a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -4,22 +4,192 @@ import pytest import asyncio import os import docker +import json +import math +import time from docker.errors import NotFound -from typing import Optional, List -from syrupy.filters import props +from typing import Optional, List, Dict +from syrupy.extensions.json import JSONSnapshotExtension +from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from text_generation import AsyncClient -from text_generation.types import Response +from text_generation.types import Response, Details, PrefillToken, Token, BestOfSequence DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None) HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") +class ResponseComparator(JSONSnapshotExtension): + def serialize( + self, + data, + *, + exclude=None, + matcher=None, + ): + if isinstance(data, List): + data = [d.dict() for d in data] + + data = self._filter( + data=data, depth=0, path=(), exclude=exclude, matcher=matcher + ) + return json.dumps(data, indent=2, ensure_ascii=False, sort_keys=False) + "\n" + + def matches( + self, + *, + serialized_data, + snapshot_data, + ) -> bool: + def convert_data(data): + data = json.loads(data) + + if isinstance(data, Dict): + return Response(**data) + if isinstance(data, List): + return [Response(**d) for d in data] + raise NotImplementedError + + def eq_token(token: Token, other: Token) -> bool: + return ( + token.id == other.id + and token.text == other.text + and math.isclose(token.logprob, other.logprob, rel_tol=0.2) + and token.special == other.special + ) + + def eq_prefill_token(prefill_token: PrefillToken, other: PrefillToken) -> bool: + try: + return ( + prefill_token.id == other.id + and prefill_token.text == other.text + and ( + math.isclose(prefill_token.logprob, other.logprob, rel_tol=0.2) + if prefill_token.logprob is not None + else prefill_token.logprob == other.logprob + ) + ) + except TypeError: + return False + + def eq_best_of(details: BestOfSequence, other: BestOfSequence) -> bool: + return ( + details.finish_reason == other.finish_reason + and details.generated_tokens == other.generated_tokens + and details.seed == other.seed + and len(details.prefill) == len(other.prefill) + and all( + [ + eq_prefill_token(d, o) + for d, o in zip(details.prefill, other.prefill) + ] + ) + and len(details.tokens) == len(other.tokens) + and all([eq_token(d, o) for d, o in zip(details.tokens, other.tokens)]) + ) + + def eq_details(details: Details, other: Details) -> bool: + return ( + details.finish_reason == other.finish_reason + and details.generated_tokens == other.generated_tokens + and details.seed == other.seed + and len(details.prefill) == len(other.prefill) + and all( + [ + eq_prefill_token(d, o) + for d, o in zip(details.prefill, other.prefill) + ] + ) + and len(details.tokens) == len(other.tokens) + and all([eq_token(d, o) for d, o in zip(details.tokens, other.tokens)]) + and ( + len(details.best_of_sequences) + if details.best_of_sequences is not None + else 0 + ) + == ( + len(other.best_of_sequences) + if other.best_of_sequences is not None + else 0 + ) + and ( + all( + [ + eq_best_of(d, o) + for d, o in zip( + details.best_of_sequences, other.best_of_sequences + ) + ] + ) + if details.best_of_sequences is not None + else details.best_of_sequences == other.best_of_sequences + ) + ) + + def eq_response(response: Response, other: Response) -> bool: + return response.generated_text == 
other.generated_text and eq_details( + response.details, other.details + ) + + serialized_data = convert_data(serialized_data) + snapshot_data = convert_data(snapshot_data) + + if not isinstance(serialized_data, List): + serialized_data = [serialized_data] + if not isinstance(snapshot_data, List): + snapshot_data = [snapshot_data] + + return len(snapshot_data) == len(serialized_data) and all( + [eq_response(r, o) for r, o in zip(serialized_data, snapshot_data)] + ) + + +class LauncherHandle: + def __init__(self, port: int): + self.client = AsyncClient(f"http://localhost:{port}") + + def _inner_health(self): + raise NotImplementedError + + async def health(self, timeout: int = 60): + assert timeout > 0 + for _ in range(timeout): + if not self._inner_health(): + raise RuntimeError("Launcher crashed") + + try: + await self.client.generate("test") + return + except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: + time.sleep(1) + raise RuntimeError("Health check failed") + + +class ContainerLauncherHandle(LauncherHandle): + def __init__(self, docker_client, container_name, port: int): + super(ContainerLauncherHandle, self).__init__(port) + self.docker_client = docker_client + self.container_name = container_name + + def _inner_health(self) -> bool: + container = self.docker_client.containers.get(self.container_name) + return container.status in ["running", "created"] + + +class ProcessLauncherHandle(LauncherHandle): + def __init__(self, process, port: int): + super(ProcessLauncherHandle, self).__init__(port) + self.process = process + + def _inner_health(self) -> bool: + return self.process.poll() is None + + @pytest.fixture -def snapshot_test(snapshot): - return lambda value: value == snapshot(exclude=props("logprob")) +def response_snapshot(snapshot): + return snapshot.use_extension(ResponseComparator) @pytest.fixture(scope="module") @@ -60,7 +230,7 @@ def launcher(event_loop): with subprocess.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) as process: - yield AsyncClient(f"http://localhost:{port}") + yield ProcessLauncherHandle(process, port) process.terminate() process.wait(60) @@ -110,7 +280,7 @@ def launcher(event_loop): command=args, name=container_name, environment=env, - auto_remove=True, + auto_remove=False, detach=True, device_requests=[ docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]]) @@ -119,13 +289,19 @@ def launcher(event_loop): ports={"80/tcp": port}, ) - yield AsyncClient(f"http://localhost:{port}") + yield ContainerLauncherHandle(client, container.name, port) - container.stop() + try: + container.stop() + container.wait() + except NotFound: + pass container_output = container.logs().decode("utf-8") print(container_output) + container.remove() + if DOCKER_IMAGE is not None: return docker_launcher return local_launcher @@ -140,7 +316,6 @@ def generate_load(): client.generate(prompt, max_new_tokens=max_new_tokens) for _ in range(n) ] - results = await asyncio.gather(*futures) - return [r.dict() for r in results] + return await asyncio.gather(*futures) return generate_load_inner diff --git a/integration-tests/models/__snapshots__/test_bloom_560m.ambr b/integration-tests/models/__snapshots__/test_bloom_560m.ambr deleted file mode 100644 index 1067513d..00000000 --- a/integration-tests/models/__snapshots__/test_bloom_560m.ambr +++ /dev/null @@ -1,627 +0,0 @@ -# serializer version: 1 -# name: test_bloom_560m - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': 
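The `ResponseComparator` extension added to `integration-tests/conftest.py` above compares responses field by field and only requires log-probabilities to be close (`rel_tol=0.2`), since logprobs drift slightly across GPUs and kernels. A minimal, self-contained sketch of that comparison idea, using plain dicts instead of the client's `Token`/`PrefillToken` models (the real extension also rebuilds `Response` objects and handles `best_of_sequences`):

```python
import math
from typing import List, Optional


def logprobs_close(a: Optional[float], b: Optional[float], rel_tol: float = 0.2) -> bool:
    # the first prefill token has a null logprob; None only matches None
    if a is None or b is None:
        return a == b
    return math.isclose(a, b, rel_tol=rel_tol)


def tokens_match(expected: List[dict], got: List[dict]) -> bool:
    # ids, texts and the `special` flag must match exactly; logprobs only approximately
    return len(expected) == len(got) and all(
        e["id"] == g["id"]
        and e["text"] == g["text"]
        and e["special"] == g["special"]
        and logprobs_close(e.get("logprob"), g.get("logprob"))
        for e, g in zip(expected, got)
    )
```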
list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 159570, - 'special': False, - 'text': ' réch', - }), - dict({ - 'id': 810, - 'special': False, - 'text': 'au', - }), - dict({ - 'id': 12736, - 'special': False, - 'text': 'ffer', - }), - dict({ - 'id': 1742, - 'special': False, - 'text': ' au', - }), - dict({ - 'id': 6105, - 'special': False, - 'text': ' bain', - }), - dict({ - 'id': 88254, - 'special': False, - 'text': '-mar', - }), - dict({ - 'id': 641, - 'special': False, - 'text': 'ie', - }), - dict({ - 'id': 2940, - 'special': False, - 'text': ' avec', - }), - ]), - }), - 'generated_text': ' le faire réchauffer au bain-marie avec', - }) -# --- -# name: test_bloom_560m_all_params - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 408, - 'special': False, - 'text': ' que', - }), - dict({ - 'id': 20288, - 'special': False, - 'text': " l'on", - }), - dict({ - 'id': 22255, - 'special': False, - 'text': ' trouve', - }), - dict({ - 'id': 1622, - 'special': False, - 'text': ' une', - }), - dict({ - 'id': 187079, - 'special': False, - 'text': ' posture', - }), - dict({ - 'id': 501, - 'special': False, - 'text': ' par', - }), - dict({ - 'id': 8741, - 'special': False, - 'text': ' rapport', - }), - dict({ - 'id': 693, - 'special': False, - 'text': ' à', - }), - dict({ - 'id': 366, - 'special': False, - 'text': ' la', - }), - dict({ - 'id': 36503, - 'special': False, - 'text': ' pratique', - }), - ]), - }), - 'generated_text': "Pour déguster un ortolan, il faut tout d'abord que l'on trouve une posture par rapport à la pratique", - }) -# --- -# name: test_bloom_560m_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 
'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 
'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json new file mode 100644 index 00000000..53a4ab85 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json @@ -0,0 +1,128 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.5625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14770508, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.4609375, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.5585938, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.4003906, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5673828, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94628906, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.703125, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": 0, + "tokens": [ + { + "id": 578, + "logprob": -1.6591797, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.4492188, + "special": false, + "text": " faire" + }, + { + "id": 159570, + "logprob": -6.6835938, + "special": false, + "text": " réch" + }, + { + "id": 810, + "logprob": 0.0, + "special": false, + "text": "au" + }, + { + "id": 12736, + "logprob": 0.0, + "special": false, + "text": "ffer" + }, + { + "id": 1742, + "logprob": -2.5175781, + "special": false, + "text": " au" + }, + { + "id": 6105, + "logprob": -2.0078125, + "special": false, + "text": " bain" + }, + { + "id": 88254, + "logprob": -0.12695312, + "special": false, + "text": "-mar" + }, + { + "id": 641, + "logprob": 0.0, + "special": false, + "text": "ie" + }, + { + "id": 2940, + "logprob": -3.5175781, + "special": false, + "text": " avec" + } + ] + }, + "generated_text": " le faire réchauffer au bain-marie avec" +} diff --git a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json 
b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json new file mode 100644 index 00000000..93a95804 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json @@ -0,0 +1,98 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 15, + "logprob": null, + "text": "," + }, + { + "id": 1669, + "logprob": -5.4414062, + "text": " il" + }, + { + "id": 11580, + "logprob": -2.3378906, + "text": " faut" + }, + { + "id": 3913, + "logprob": -4.3554688, + "text": " tout" + }, + { + "id": 39261, + "logprob": -2.9238281, + "text": " d'abord" + } + ], + "seed": 0, + "tokens": [ + { + "id": 408, + "logprob": -1.9267578, + "special": false, + "text": " que" + }, + { + "id": 20288, + "logprob": -2.9257812, + "special": false, + "text": " l'on" + }, + { + "id": 22255, + "logprob": -2.8964844, + "special": false, + "text": " trouve" + }, + { + "id": 1622, + "logprob": -1.1083984, + "special": false, + "text": " une" + }, + { + "id": 187079, + "logprob": -7.796875, + "special": false, + "text": " posture" + }, + { + "id": 501, + "logprob": -5.390625, + "special": false, + "text": " par" + }, + { + "id": 8741, + "logprob": -0.34936523, + "special": false, + "text": " rapport" + }, + { + "id": 693, + "logprob": 0.0, + "special": false, + "text": " à" + }, + { + "id": 366, + "logprob": -2.3378906, + "special": false, + "text": " la" + }, + { + "id": 36503, + "logprob": -3.6640625, + "special": false, + "text": " pratique" + } + ] + }, + "generated_text": "Pour déguster un ortolan, il faut tout d'abord que l'on trouve une posture par rapport à la pratique" +} diff --git a/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json new file mode 100644 index 00000000..0a86bef8 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json @@ -0,0 +1,514 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.5625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14770508, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.4609375, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.5585938, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.4003906, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5673828, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94628906, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.703125, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7646484, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.6113281, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5263672, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -0.00010049343, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.4707031, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.2119141, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11883545, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + 
"logprob": -0.40844727, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.0037841797, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0195312, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.53125, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14770508, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.4140625, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.5234375, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.3613281, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5458984, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94189453, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.7011719, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7548828, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.578125, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5117188, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -0.00010049343, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.4707031, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11004639, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.4506836, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.003047943, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0185547, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.53125, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14770508, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.4140625, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.5234375, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.3613281, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5458984, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94189453, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.7011719, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7548828, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.578125, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5117188, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -0.00010049343, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.4707031, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": 
-0.11004639, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.4506836, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.003047943, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0185547, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.53125, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14770508, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.4140625, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.5234375, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.3613281, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5458984, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94189453, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.7011719, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7548828, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.578125, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5117188, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -0.00010049343, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.4707031, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11004639, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.4506836, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.003047943, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0185547, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + } +] diff --git a/integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr b/integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr deleted file mode 100644 index 667a0373..00000000 --- a/integration-tests/models/__snapshots__/test_bloom_560m_sharded.ambr +++ /dev/null @@ -1,542 +0,0 @@ -# serializer version: 1 -# name: test_bloom_560m_sharded - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 159570, - 'special': False, - 'text': ' réch', - }), - dict({ - 'id': 810, - 'special': False, - 'text': 'au', - }), - dict({ - 'id': 
12736, - 'special': False, - 'text': 'ffer', - }), - dict({ - 'id': 1742, - 'special': False, - 'text': ' au', - }), - dict({ - 'id': 6105, - 'special': False, - 'text': ' bain', - }), - dict({ - 'id': 88254, - 'special': False, - 'text': '-mar', - }), - dict({ - 'id': 641, - 'special': False, - 'text': 'ie', - }), - dict({ - 'id': 2940, - 'special': False, - 'text': ' avec', - }), - ]), - }), - 'generated_text': ' le faire réchauffer au bain-marie avec', - }) -# --- -# name: test_bloom_560m_sharded_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ 
- 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 17934, - 'text': 'Pour', - }), - dict({ - 'id': 49833, - 'text': ' dég', - }), - dict({ - 'id': 21543, - 'text': 'uster', - }), - dict({ - 'id': 447, - 'text': ' un', - }), - dict({ - 'id': 46341, - 'text': ' ort', - }), - dict({ - 'id': 35567, - 'text': 'olan', - }), - dict({ - 'id': 15, - 'text': ',', - }), - dict({ - 'id': 1669, - 'text': ' il', - }), - dict({ - 'id': 11580, - 'text': ' faut', - }), - dict({ - 'id': 3913, - 'text': ' tout', - }), - dict({ - 'id': 39261, - 'text': " d'abord", - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 578, - 'special': False, - 'text': ' le', - }), - dict({ - 'id': 5608, - 'special': False, - 'text': ' faire', - }), - dict({ - 'id': 1767, - 'special': False, - 'text': ' cu', - }), - dict({ - 'id': 1273, - 'special': False, - 'text': 'ire', - }), - dict({ - 'id': 1486, - 'special': False, - 'text': ' dans', - }), - dict({ - 'id': 283, - 'special': False, - 'text': ' de', - }), - dict({ - 'id': 40410, - 'special': False, - 'text': " l'eau", - }), - dict({ - 'id': 20226, - 'special': False, - 'text': ' bou', - }), - dict({ - 'id': 172483, - 'special': False, - 'text': 'illante', - }), - dict({ - 'id': 2805, - 'special': False, - 'text': ' sal', - }), - ]), - }), - 'generated_text': " le faire cuire dans de l'eau bouillante sal", - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json new file mode 100644 index 00000000..dd8936af --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json @@ -0,0 +1,128 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.5390625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14758301, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9296875, + "text": " un" + }, + { + "id": 46341, + 
"logprob": -15.4453125, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.59375, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.3994141, + "text": "," + }, + { + "id": 1669, + "logprob": -1.578125, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.9453125, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.7011719, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": 0, + "tokens": [ + { + "id": 578, + "logprob": -1.6474609, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.5097656, + "special": false, + "text": " faire" + }, + { + "id": 159570, + "logprob": -6.65625, + "special": false, + "text": " réch" + }, + { + "id": 810, + "logprob": 0.0, + "special": false, + "text": "au" + }, + { + "id": 12736, + "logprob": 0.0, + "special": false, + "text": "ffer" + }, + { + "id": 1742, + "logprob": -2.5859375, + "special": false, + "text": " au" + }, + { + "id": 6105, + "logprob": -2.03125, + "special": false, + "text": " bain" + }, + { + "id": 88254, + "logprob": -0.12695312, + "special": false, + "text": "-mar" + }, + { + "id": 641, + "logprob": 0.0, + "special": false, + "text": "ie" + }, + { + "id": 2940, + "logprob": -3.5175781, + "special": false, + "text": " avec" + } + ] + }, + "generated_text": " le faire réchauffer au bain-marie avec" +} diff --git a/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json new file mode 100644 index 00000000..2dd480b9 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json @@ -0,0 +1,514 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.5390625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.14758301, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9296875, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.4453125, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.59375, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.3994141, + "text": "," + }, + { + "id": 1669, + "logprob": -1.578125, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.9453125, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.7011719, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.5732422, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7529297, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.6054688, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5283203, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -0.00010049343, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.4716797, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11853027, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.41210938, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.0037765503, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0166016, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau 
bouillante sal" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.515625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.1484375, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.34375, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.515625, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.4199219, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5664062, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94091797, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.6660156, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.7753906, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7626953, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.5820312, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5097656, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -9.393692e-05, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.5175781, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11883545, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.4909668, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.003047943, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0185547, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.515625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.1484375, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.34375, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.515625, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.4199219, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5664062, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94091797, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.6660156, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.7753906, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7626953, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.5820312, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5097656, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -9.393692e-05, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.5175781, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11883545, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.4909668, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.003047943, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0185547, + "special": false, + "text": " sal" + 
} + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 17934, + "logprob": null, + "text": "Pour" + }, + { + "id": 49833, + "logprob": -10.515625, + "text": " dég" + }, + { + "id": 21543, + "logprob": -0.1484375, + "text": "uster" + }, + { + "id": 447, + "logprob": -1.9287109, + "text": " un" + }, + { + "id": 46341, + "logprob": -15.34375, + "text": " ort" + }, + { + "id": 35567, + "logprob": -7.515625, + "text": "olan" + }, + { + "id": 15, + "logprob": -1.4199219, + "text": "," + }, + { + "id": 1669, + "logprob": -1.5664062, + "text": " il" + }, + { + "id": 11580, + "logprob": -0.94091797, + "text": " faut" + }, + { + "id": 3913, + "logprob": -3.6660156, + "text": " tout" + }, + { + "id": 39261, + "logprob": -1.7753906, + "text": " d'abord" + } + ], + "seed": null, + "tokens": [ + { + "id": 578, + "logprob": -1.7626953, + "special": false, + "text": " le" + }, + { + "id": 5608, + "logprob": -2.5820312, + "special": false, + "text": " faire" + }, + { + "id": 1767, + "logprob": -1.5097656, + "special": false, + "text": " cu" + }, + { + "id": 1273, + "logprob": -9.393692e-05, + "special": false, + "text": "ire" + }, + { + "id": 1486, + "logprob": -1.5175781, + "special": false, + "text": " dans" + }, + { + "id": 283, + "logprob": -1.1982422, + "special": false, + "text": " de" + }, + { + "id": 40410, + "logprob": -0.11883545, + "special": false, + "text": " l'eau" + }, + { + "id": 20226, + "logprob": -0.4909668, + "special": false, + "text": " bou" + }, + { + "id": 172483, + "logprob": -0.003047943, + "special": false, + "text": "illante" + }, + { + "id": 2805, + "logprob": -1.0185547, + "special": false, + "text": " sal" + } + ] + }, + "generated_text": " le faire cuire dans de l'eau bouillante sal" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_llama.ambr b/integration-tests/models/__snapshots__/test_flash_llama.ambr deleted file mode 100644 index f4e3a4c1..00000000 --- a/integration-tests/models/__snapshots__/test_flash_llama.ambr +++ /dev/null @@ -1,465 +0,0 @@ -# serializer version: 1 -# name: test_flash_llama - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 1, - 'text': '', - }), - dict({ - 'id': 4321, - 'text': 'Test', - }), - dict({ - 'id': 2009, - 'text': 'request', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 363, - 'special': False, - 'text': ' for', - }), - dict({ - 'id': 847, - 'special': False, - 'text': ' /', - }), - dict({ - 'id': 2754, - 'special': False, - 'text': 'api', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29894, - 'special': False, - 'text': 'v', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 16418, - 'special': False, - 'text': 'projects', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - ]), - }), - 'generated_text': 'for /api/v1/projects/1', - }) -# --- -# name: test_flash_llama_all_params - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 1, - 'text': '', - }), - dict({ - 'id': 4321, - 'text': 'Test', - }), - dict({ - 'id': 2009, - 'text': 'request', 
- }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 5229, - 'special': False, - 'text': ' failed', - }), - dict({ - 'id': 363, - 'special': False, - 'text': ' for', - }), - dict({ - 'id': 5641, - 'special': False, - 'text': ' IP', - }), - dict({ - 'id': 16428, - 'special': False, - 'text': ' Address', - }), - dict({ - 'id': 29901, - 'special': False, - 'text': ':', - }), - dict({ - 'id': 525, - 'special': False, - 'text': " '", - }), - dict({ - 'id': 8516, - 'special': False, - 'text': 'None', - }), - dict({ - 'id': 4286, - 'special': False, - 'text': "'.", - }), - dict({ - 'id': 13, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 294, - 'special': False, - 'text': 'as', - }), - ]), - }), - 'generated_text': ''' - Test requestfailed for IP Address: 'None'. - as - ''', - }) -# --- -# name: test_flash_llama_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 1, - 'text': '', - }), - dict({ - 'id': 4321, - 'text': 'Test', - }), - dict({ - 'id': 2009, - 'text': 'request', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 363, - 'special': False, - 'text': ' for', - }), - dict({ - 'id': 847, - 'special': False, - 'text': ' /', - }), - dict({ - 'id': 2754, - 'special': False, - 'text': 'api', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29894, - 'special': False, - 'text': 'v', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 16418, - 'special': False, - 'text': 'projects', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - ]), - }), - 'generated_text': 'for /api/v1/projects/1', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 1, - 'text': '', - }), - dict({ - 'id': 4321, - 'text': 'Test', - }), - dict({ - 'id': 2009, - 'text': 'request', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 363, - 'special': False, - 'text': ' for', - }), - dict({ - 'id': 847, - 'special': False, - 'text': ' /', - }), - dict({ - 'id': 2754, - 'special': False, - 'text': 'api', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29894, - 'special': False, - 'text': 'v', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 16418, - 'special': False, - 'text': 'projects', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - ]), - }), - 'generated_text': 'for /api/v1/projects/1', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 1, - 'text': '', - }), - dict({ - 'id': 4321, - 'text': 'Test', - }), - dict({ - 'id': 2009, - 'text': 'request', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 363, - 'special': False, - 'text': ' for', - }), - dict({ - 'id': 847, - 'special': False, - 'text': ' /', - }), - dict({ - 'id': 2754, - 'special': False, - 'text': 'api', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29894, - 'special': False, - 'text': 'v', - }), - dict({ - 
'id': 29896, - 'special': False, - 'text': '1', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 16418, - 'special': False, - 'text': 'projects', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - ]), - }), - 'generated_text': 'for /api/v1/projects/1', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 1, - 'text': '', - }), - dict({ - 'id': 4321, - 'text': 'Test', - }), - dict({ - 'id': 2009, - 'text': 'request', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 363, - 'special': False, - 'text': ' for', - }), - dict({ - 'id': 847, - 'special': False, - 'text': ' /', - }), - dict({ - 'id': 2754, - 'special': False, - 'text': 'api', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29894, - 'special': False, - 'text': 'v', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 16418, - 'special': False, - 'text': 'projects', - }), - dict({ - 'id': 29914, - 'special': False, - 'text': '/', - }), - dict({ - 'id': 29896, - 'special': False, - 'text': '1', - }), - ]), - }), - 'generated_text': 'for /api/v1/projects/1', - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json new file mode 100644 index 00000000..49bc996c --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json @@ -0,0 +1,88 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 363, + "logprob": -1.5380859, + "special": false, + "text": " for" + }, + { + "id": 847, + "logprob": -2.5917969, + "special": false, + "text": " /" + }, + { + "id": 2754, + "logprob": -2.2773438, + "special": false, + "text": "api" + }, + { + "id": 29914, + "logprob": -0.034362793, + "special": false, + "text": "/" + }, + { + "id": 29894, + "logprob": -0.96533203, + "special": false, + "text": "v" + }, + { + "id": 29896, + "logprob": -0.36669922, + "special": false, + "text": "1" + }, + { + "id": 29914, + "logprob": -0.013122559, + "special": false, + "text": "/" + }, + { + "id": 16418, + "logprob": -3.1503906, + "special": false, + "text": "projects" + }, + { + "id": 29914, + "logprob": -0.43652344, + "special": false, + "text": "/" + }, + { + "id": 29896, + "logprob": -1.9404297, + "special": false, + "text": "1" + } + ] + }, + "generated_text": "for /api/v1/projects/1" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json new file mode 100644 index 00000000..1b6b51a3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json @@ -0,0 +1,88 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": 
-8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": 0, + "tokens": [ + { + "id": 5229, + "logprob": -3.3085938, + "special": false, + "text": " failed" + }, + { + "id": 363, + "logprob": -3.984375, + "special": false, + "text": " for" + }, + { + "id": 5641, + "logprob": -6.53125, + "special": false, + "text": " IP" + }, + { + "id": 16428, + "logprob": -3.1835938, + "special": false, + "text": " Address" + }, + { + "id": 29901, + "logprob": -1.2324219, + "special": false, + "text": ":" + }, + { + "id": 525, + "logprob": -2.6855469, + "special": false, + "text": " '" + }, + { + "id": 8516, + "logprob": -7.1601562, + "special": false, + "text": "None" + }, + { + "id": 4286, + "logprob": -2.4433594, + "special": false, + "text": "'." + }, + { + "id": 13, + "logprob": -0.06530762, + "special": false, + "text": "\n" + }, + { + "id": 294, + "logprob": -7.953125, + "special": false, + "text": "as" + } + ] + }, + "generated_text": "Test requestfailed for IP Address: 'None'.\nas" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json new file mode 100644 index 00000000..5a8ba217 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json @@ -0,0 +1,354 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 363, + "logprob": -1.5322266, + "special": false, + "text": " for" + }, + { + "id": 847, + "logprob": -2.5585938, + "special": false, + "text": " /" + }, + { + "id": 2754, + "logprob": -2.265625, + "special": false, + "text": "api" + }, + { + "id": 29914, + "logprob": -0.034088135, + "special": false, + "text": "/" + }, + { + "id": 29894, + "logprob": -0.96240234, + "special": false, + "text": "v" + }, + { + "id": 29896, + "logprob": -0.36816406, + "special": false, + "text": "1" + }, + { + "id": 29914, + "logprob": -0.013191223, + "special": false, + "text": "/" + }, + { + "id": 16418, + "logprob": -3.15625, + "special": false, + "text": "projects" + }, + { + "id": 29914, + "logprob": -0.43774414, + "special": false, + "text": "/" + }, + { + "id": 29896, + "logprob": -1.9443359, + "special": false, + "text": "1" + } + ] + }, + "generated_text": "for /api/v1/projects/1" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 363, + "logprob": -1.5380859, + "special": false, + "text": " for" + }, + { + "id": 847, + "logprob": -2.5859375, + "special": false, + "text": " /" + }, + { + "id": 2754, + "logprob": -2.2695312, + "special": false, + "text": "api" + }, + { + "id": 29914, + "logprob": -0.03439331, + "special": false, + "text": "/" + }, + { + "id": 29894, + "logprob": -0.96240234, + "special": false, + "text": "v" + }, + { + "id": 29896, + "logprob": -0.36694336, + "special": false, + "text": "1" + }, + { + "id": 29914, + "logprob": -0.013114929, + "special": false, + "text": "/" + 
}, + { + "id": 16418, + "logprob": -3.1542969, + "special": false, + "text": "projects" + }, + { + "id": 29914, + "logprob": -0.43847656, + "special": false, + "text": "/" + }, + { + "id": 29896, + "logprob": -1.9433594, + "special": false, + "text": "1" + } + ] + }, + "generated_text": "for /api/v1/projects/1" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 363, + "logprob": -1.5322266, + "special": false, + "text": " for" + }, + { + "id": 847, + "logprob": -2.5585938, + "special": false, + "text": " /" + }, + { + "id": 2754, + "logprob": -2.265625, + "special": false, + "text": "api" + }, + { + "id": 29914, + "logprob": -0.034088135, + "special": false, + "text": "/" + }, + { + "id": 29894, + "logprob": -0.96240234, + "special": false, + "text": "v" + }, + { + "id": 29896, + "logprob": -0.36816406, + "special": false, + "text": "1" + }, + { + "id": 29914, + "logprob": -0.013191223, + "special": false, + "text": "/" + }, + { + "id": 16418, + "logprob": -3.15625, + "special": false, + "text": "projects" + }, + { + "id": 29914, + "logprob": -0.43774414, + "special": false, + "text": "/" + }, + { + "id": 29896, + "logprob": -1.9443359, + "special": false, + "text": "1" + } + ] + }, + "generated_text": "for /api/v1/projects/1" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 363, + "logprob": -1.5322266, + "special": false, + "text": " for" + }, + { + "id": 847, + "logprob": -2.5585938, + "special": false, + "text": " /" + }, + { + "id": 2754, + "logprob": -2.265625, + "special": false, + "text": "api" + }, + { + "id": 29914, + "logprob": -0.034088135, + "special": false, + "text": "/" + }, + { + "id": 29894, + "logprob": -0.96240234, + "special": false, + "text": "v" + }, + { + "id": 29896, + "logprob": -0.36816406, + "special": false, + "text": "1" + }, + { + "id": 29914, + "logprob": -0.013191223, + "special": false, + "text": "/" + }, + { + "id": 16418, + "logprob": -3.15625, + "special": false, + "text": "projects" + }, + { + "id": 29914, + "logprob": -0.43774414, + "special": false, + "text": "/" + }, + { + "id": 29896, + "logprob": -1.9443359, + "special": false, + "text": "1" + } + ] + }, + "generated_text": "for /api/v1/projects/1" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_neox.ambr b/integration-tests/models/__snapshots__/test_flash_neox.ambr deleted file mode 100644 index 4330db6b..00000000 --- a/integration-tests/models/__snapshots__/test_flash_neox.ambr +++ /dev/null @@ -1,682 +0,0 @@ -# serializer version: 1 -# name: test_flash_neox - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 50278, - 'text': '<|prompter|>', - }), - dict({ - 'id': 1276, - 'text': 'What', - }), - dict({ - 'id': 310, - 'text': ' is', - }), - dict({ - 'id': 247, - 'text': ' a', - }), - dict({ - 'id': 1167, - 'text': ' mem', - }), - dict({ - 'id': 70, - 'text': 'e', - }), - dict({ - 'id': 13, - 
'text': ',', - }), - dict({ - 'id': 285, - 'text': ' and', - }), - dict({ - 'id': 752, - 'text': ' what', - }), - dict({ - 'id': 434, - 'text': "'s", - }), - dict({ - 'id': 253, - 'text': ' the', - }), - dict({ - 'id': 2892, - 'text': ' history', - }), - dict({ - 'id': 3212, - 'text': ' behind', - }), - dict({ - 'id': 436, - 'text': ' this', - }), - dict({ - 'id': 3159, - 'text': ' word', - }), - dict({ - 'id': 32, - 'text': '?', - }), - dict({ - 'id': 0, - 'text': '<|endoftext|>', - }), - dict({ - 'id': 50281, - 'text': '<|assistant|>', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 510, - 'special': False, - 'text': 'The', - }), - dict({ - 'id': 3159, - 'special': False, - 'text': ' word', - }), - dict({ - 'id': 346, - 'special': False, - 'text': ' "', - }), - dict({ - 'id': 6441, - 'special': False, - 'text': 'mem', - }), - dict({ - 'id': 70, - 'special': False, - 'text': 'e', - }), - dict({ - 'id': 3, - 'special': False, - 'text': '"', - }), - dict({ - 'id': 369, - 'special': False, - 'text': ' was', - }), - dict({ - 'id': 806, - 'special': False, - 'text': ' first', - }), - dict({ - 'id': 908, - 'special': False, - 'text': ' used', - }), - dict({ - 'id': 275, - 'special': False, - 'text': ' in', - }), - ]), - }), - 'generated_text': 'The word "meme" was first used in', - }) -# --- -# name: test_flash_neox_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 50278, - 'text': '<|prompter|>', - }), - dict({ - 'id': 1276, - 'text': 'What', - }), - dict({ - 'id': 310, - 'text': ' is', - }), - dict({ - 'id': 247, - 'text': ' a', - }), - dict({ - 'id': 1167, - 'text': ' mem', - }), - dict({ - 'id': 70, - 'text': 'e', - }), - dict({ - 'id': 13, - 'text': ',', - }), - dict({ - 'id': 285, - 'text': ' and', - }), - dict({ - 'id': 752, - 'text': ' what', - }), - dict({ - 'id': 434, - 'text': "'s", - }), - dict({ - 'id': 253, - 'text': ' the', - }), - dict({ - 'id': 2892, - 'text': ' history', - }), - dict({ - 'id': 3212, - 'text': ' behind', - }), - dict({ - 'id': 436, - 'text': ' this', - }), - dict({ - 'id': 3159, - 'text': ' word', - }), - dict({ - 'id': 32, - 'text': '?', - }), - dict({ - 'id': 0, - 'text': '<|endoftext|>', - }), - dict({ - 'id': 50281, - 'text': '<|assistant|>', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 510, - 'special': False, - 'text': 'The', - }), - dict({ - 'id': 3159, - 'special': False, - 'text': ' word', - }), - dict({ - 'id': 346, - 'special': False, - 'text': ' "', - }), - dict({ - 'id': 6441, - 'special': False, - 'text': 'mem', - }), - dict({ - 'id': 70, - 'special': False, - 'text': 'e', - }), - dict({ - 'id': 3, - 'special': False, - 'text': '"', - }), - dict({ - 'id': 369, - 'special': False, - 'text': ' was', - }), - dict({ - 'id': 806, - 'special': False, - 'text': ' first', - }), - dict({ - 'id': 908, - 'special': False, - 'text': ' used', - }), - dict({ - 'id': 275, - 'special': False, - 'text': ' in', - }), - ]), - }), - 'generated_text': 'The word "meme" was first used in', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 50278, - 'text': '<|prompter|>', - }), - dict({ - 'id': 1276, - 'text': 'What', - }), - dict({ - 'id': 310, - 'text': ' is', - }), - dict({ - 'id': 247, - 'text': ' a', - }), - dict({ - 'id': 1167, - 'text': ' mem', - }), - dict({ - 'id': 70, - 'text': 'e', - }), - dict({ - 'id': 13, - 'text': ',', - 
}), - dict({ - 'id': 285, - 'text': ' and', - }), - dict({ - 'id': 752, - 'text': ' what', - }), - dict({ - 'id': 434, - 'text': "'s", - }), - dict({ - 'id': 253, - 'text': ' the', - }), - dict({ - 'id': 2892, - 'text': ' history', - }), - dict({ - 'id': 3212, - 'text': ' behind', - }), - dict({ - 'id': 436, - 'text': ' this', - }), - dict({ - 'id': 3159, - 'text': ' word', - }), - dict({ - 'id': 32, - 'text': '?', - }), - dict({ - 'id': 0, - 'text': '<|endoftext|>', - }), - dict({ - 'id': 50281, - 'text': '<|assistant|>', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 510, - 'special': False, - 'text': 'The', - }), - dict({ - 'id': 3159, - 'special': False, - 'text': ' word', - }), - dict({ - 'id': 346, - 'special': False, - 'text': ' "', - }), - dict({ - 'id': 6441, - 'special': False, - 'text': 'mem', - }), - dict({ - 'id': 70, - 'special': False, - 'text': 'e', - }), - dict({ - 'id': 3, - 'special': False, - 'text': '"', - }), - dict({ - 'id': 369, - 'special': False, - 'text': ' was', - }), - dict({ - 'id': 806, - 'special': False, - 'text': ' first', - }), - dict({ - 'id': 908, - 'special': False, - 'text': ' used', - }), - dict({ - 'id': 275, - 'special': False, - 'text': ' in', - }), - ]), - }), - 'generated_text': 'The word "meme" was first used in', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 50278, - 'text': '<|prompter|>', - }), - dict({ - 'id': 1276, - 'text': 'What', - }), - dict({ - 'id': 310, - 'text': ' is', - }), - dict({ - 'id': 247, - 'text': ' a', - }), - dict({ - 'id': 1167, - 'text': ' mem', - }), - dict({ - 'id': 70, - 'text': 'e', - }), - dict({ - 'id': 13, - 'text': ',', - }), - dict({ - 'id': 285, - 'text': ' and', - }), - dict({ - 'id': 752, - 'text': ' what', - }), - dict({ - 'id': 434, - 'text': "'s", - }), - dict({ - 'id': 253, - 'text': ' the', - }), - dict({ - 'id': 2892, - 'text': ' history', - }), - dict({ - 'id': 3212, - 'text': ' behind', - }), - dict({ - 'id': 436, - 'text': ' this', - }), - dict({ - 'id': 3159, - 'text': ' word', - }), - dict({ - 'id': 32, - 'text': '?', - }), - dict({ - 'id': 0, - 'text': '<|endoftext|>', - }), - dict({ - 'id': 50281, - 'text': '<|assistant|>', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 510, - 'special': False, - 'text': 'The', - }), - dict({ - 'id': 3159, - 'special': False, - 'text': ' word', - }), - dict({ - 'id': 346, - 'special': False, - 'text': ' "', - }), - dict({ - 'id': 6441, - 'special': False, - 'text': 'mem', - }), - dict({ - 'id': 70, - 'special': False, - 'text': 'e', - }), - dict({ - 'id': 3, - 'special': False, - 'text': '"', - }), - dict({ - 'id': 369, - 'special': False, - 'text': ' was', - }), - dict({ - 'id': 806, - 'special': False, - 'text': ' first', - }), - dict({ - 'id': 908, - 'special': False, - 'text': ' used', - }), - dict({ - 'id': 275, - 'special': False, - 'text': ' in', - }), - ]), - }), - 'generated_text': 'The word "meme" was first used in', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 50278, - 'text': '<|prompter|>', - }), - dict({ - 'id': 1276, - 'text': 'What', - }), - dict({ - 'id': 310, - 'text': ' is', - }), - dict({ - 'id': 247, - 'text': ' a', - }), - dict({ - 'id': 1167, - 'text': ' mem', - }), - dict({ - 'id': 70, - 'text': 'e', - }), - dict({ - 'id': 13, - 'text': ',', - }), - dict({ - 'id': 285, - 'text': ' and', - }), - dict({ - 
'id': 752, - 'text': ' what', - }), - dict({ - 'id': 434, - 'text': "'s", - }), - dict({ - 'id': 253, - 'text': ' the', - }), - dict({ - 'id': 2892, - 'text': ' history', - }), - dict({ - 'id': 3212, - 'text': ' behind', - }), - dict({ - 'id': 436, - 'text': ' this', - }), - dict({ - 'id': 3159, - 'text': ' word', - }), - dict({ - 'id': 32, - 'text': '?', - }), - dict({ - 'id': 0, - 'text': '<|endoftext|>', - }), - dict({ - 'id': 50281, - 'text': '<|assistant|>', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 510, - 'special': False, - 'text': 'The', - }), - dict({ - 'id': 3159, - 'special': False, - 'text': ' word', - }), - dict({ - 'id': 346, - 'special': False, - 'text': ' "', - }), - dict({ - 'id': 6441, - 'special': False, - 'text': 'mem', - }), - dict({ - 'id': 70, - 'special': False, - 'text': 'e', - }), - dict({ - 'id': 3, - 'special': False, - 'text': '"', - }), - dict({ - 'id': 369, - 'special': False, - 'text': ' was', - }), - dict({ - 'id': 806, - 'special': False, - 'text': ' first', - }), - dict({ - 'id': 908, - 'special': False, - 'text': ' used', - }), - dict({ - 'id': 275, - 'special': False, - 'text': ' in', - }), - ]), - }), - 'generated_text': 'The word "meme" was first used in', - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox.json b/integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox.json new file mode 100644 index 00000000..787704ce --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox.json @@ -0,0 +1,163 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 50278, + "logprob": null, + "text": "<|prompter|>" + }, + { + "id": 1276, + "logprob": -8.03125, + "text": "What" + }, + { + "id": 310, + "logprob": -5.421875, + "text": " is" + }, + { + "id": 247, + "logprob": -2.1601562, + "text": " a" + }, + { + "id": 1167, + "logprob": -5.4609375, + "text": " mem" + }, + { + "id": 70, + "logprob": -0.005657196, + "text": "e" + }, + { + "id": 13, + "logprob": -7.28125, + "text": "," + }, + { + "id": 285, + "logprob": -0.2980957, + "text": " and" + }, + { + "id": 752, + "logprob": -2.1679688, + "text": " what" + }, + { + "id": 434, + "logprob": -5.6210938, + "text": "'s" + }, + { + "id": 253, + "logprob": -0.81103516, + "text": " the" + }, + { + "id": 2892, + "logprob": -6.6640625, + "text": " history" + }, + { + "id": 3212, + "logprob": -2.265625, + "text": " behind" + }, + { + "id": 436, + "logprob": -11.5078125, + "text": " this" + }, + { + "id": 3159, + "logprob": -2.1582031, + "text": " word" + }, + { + "id": 32, + "logprob": -0.008720398, + "text": "?" 
+ }, + { + "id": 0, + "logprob": -2.4726562, + "text": "<|endoftext|>" + }, + { + "id": 50281, + "logprob": -18.265625, + "text": "<|assistant|>" + } + ], + "seed": null, + "tokens": [ + { + "id": 510, + "logprob": -0.63183594, + "special": false, + "text": "The" + }, + { + "id": 3159, + "logprob": -0.5390625, + "special": false, + "text": " word" + }, + { + "id": 346, + "logprob": -0.045684814, + "special": false, + "text": " \"" + }, + { + "id": 6441, + "logprob": -0.002090454, + "special": false, + "text": "mem" + }, + { + "id": 70, + "logprob": -1.3589859e-05, + "special": false, + "text": "e" + }, + { + "id": 3, + "logprob": -0.0009455681, + "special": false, + "text": "\"" + }, + { + "id": 369, + "logprob": -0.088012695, + "special": false, + "text": " was" + }, + { + "id": 806, + "logprob": -0.12585449, + "special": false, + "text": " first" + }, + { + "id": 908, + "logprob": -0.017196655, + "special": false, + "text": " used" + }, + { + "id": 275, + "logprob": -0.49731445, + "special": false, + "text": " in" + } + ] + }, + "generated_text": "The word \"meme\" was first used in" +} diff --git a/integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox_load.json b/integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox_load.json new file mode 100644 index 00000000..47d6a77e --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox_load.json @@ -0,0 +1,654 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 50278, + "logprob": null, + "text": "<|prompter|>" + }, + { + "id": 1276, + "logprob": -8.03125, + "text": "What" + }, + { + "id": 310, + "logprob": -5.421875, + "text": " is" + }, + { + "id": 247, + "logprob": -2.1601562, + "text": " a" + }, + { + "id": 1167, + "logprob": -5.4609375, + "text": " mem" + }, + { + "id": 70, + "logprob": -0.005657196, + "text": "e" + }, + { + "id": 13, + "logprob": -7.28125, + "text": "," + }, + { + "id": 285, + "logprob": -0.2980957, + "text": " and" + }, + { + "id": 752, + "logprob": -2.1679688, + "text": " what" + }, + { + "id": 434, + "logprob": -5.6210938, + "text": "'s" + }, + { + "id": 253, + "logprob": -0.81103516, + "text": " the" + }, + { + "id": 2892, + "logprob": -6.6640625, + "text": " history" + }, + { + "id": 3212, + "logprob": -2.265625, + "text": " behind" + }, + { + "id": 436, + "logprob": -11.5078125, + "text": " this" + }, + { + "id": 3159, + "logprob": -2.1582031, + "text": " word" + }, + { + "id": 32, + "logprob": -0.008720398, + "text": "?" 
+ }, + { + "id": 0, + "logprob": -2.4726562, + "text": "<|endoftext|>" + }, + { + "id": 50281, + "logprob": -18.265625, + "text": "<|assistant|>" + } + ], + "seed": null, + "tokens": [ + { + "id": 510, + "logprob": -0.63183594, + "special": false, + "text": "The" + }, + { + "id": 3159, + "logprob": -0.5488281, + "special": false, + "text": " word" + }, + { + "id": 346, + "logprob": -0.045684814, + "special": false, + "text": " \"" + }, + { + "id": 6441, + "logprob": -0.00207901, + "special": false, + "text": "mem" + }, + { + "id": 70, + "logprob": -1.335144e-05, + "special": false, + "text": "e" + }, + { + "id": 3, + "logprob": -0.00097227097, + "special": false, + "text": "\"" + }, + { + "id": 369, + "logprob": -0.0892334, + "special": false, + "text": " was" + }, + { + "id": 806, + "logprob": -0.12463379, + "special": false, + "text": " first" + }, + { + "id": 908, + "logprob": -0.01737976, + "special": false, + "text": " used" + }, + { + "id": 275, + "logprob": -0.50341797, + "special": false, + "text": " in" + } + ] + }, + "generated_text": "The word \"meme\" was first used in" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 50278, + "logprob": null, + "text": "<|prompter|>" + }, + { + "id": 1276, + "logprob": -8.03125, + "text": "What" + }, + { + "id": 310, + "logprob": -5.421875, + "text": " is" + }, + { + "id": 247, + "logprob": -2.1601562, + "text": " a" + }, + { + "id": 1167, + "logprob": -5.4609375, + "text": " mem" + }, + { + "id": 70, + "logprob": -0.005657196, + "text": "e" + }, + { + "id": 13, + "logprob": -7.28125, + "text": "," + }, + { + "id": 285, + "logprob": -0.2980957, + "text": " and" + }, + { + "id": 752, + "logprob": -2.1679688, + "text": " what" + }, + { + "id": 434, + "logprob": -5.6210938, + "text": "'s" + }, + { + "id": 253, + "logprob": -0.81103516, + "text": " the" + }, + { + "id": 2892, + "logprob": -6.6640625, + "text": " history" + }, + { + "id": 3212, + "logprob": -2.265625, + "text": " behind" + }, + { + "id": 436, + "logprob": -11.5078125, + "text": " this" + }, + { + "id": 3159, + "logprob": -2.1582031, + "text": " word" + }, + { + "id": 32, + "logprob": -0.008720398, + "text": "?" 
+ }, + { + "id": 0, + "logprob": -2.4726562, + "text": "<|endoftext|>" + }, + { + "id": 50281, + "logprob": -18.265625, + "text": "<|assistant|>" + } + ], + "seed": null, + "tokens": [ + { + "id": 510, + "logprob": -0.63183594, + "special": false, + "text": "The" + }, + { + "id": 3159, + "logprob": -0.5488281, + "special": false, + "text": " word" + }, + { + "id": 346, + "logprob": -0.045684814, + "special": false, + "text": " \"" + }, + { + "id": 6441, + "logprob": -0.00207901, + "special": false, + "text": "mem" + }, + { + "id": 70, + "logprob": -1.335144e-05, + "special": false, + "text": "e" + }, + { + "id": 3, + "logprob": -0.00097227097, + "special": false, + "text": "\"" + }, + { + "id": 369, + "logprob": -0.0892334, + "special": false, + "text": " was" + }, + { + "id": 806, + "logprob": -0.12463379, + "special": false, + "text": " first" + }, + { + "id": 908, + "logprob": -0.01737976, + "special": false, + "text": " used" + }, + { + "id": 275, + "logprob": -0.50341797, + "special": false, + "text": " in" + } + ] + }, + "generated_text": "The word \"meme\" was first used in" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 50278, + "logprob": null, + "text": "<|prompter|>" + }, + { + "id": 1276, + "logprob": -8.03125, + "text": "What" + }, + { + "id": 310, + "logprob": -5.421875, + "text": " is" + }, + { + "id": 247, + "logprob": -2.1601562, + "text": " a" + }, + { + "id": 1167, + "logprob": -5.4609375, + "text": " mem" + }, + { + "id": 70, + "logprob": -0.005657196, + "text": "e" + }, + { + "id": 13, + "logprob": -7.28125, + "text": "," + }, + { + "id": 285, + "logprob": -0.2980957, + "text": " and" + }, + { + "id": 752, + "logprob": -2.1679688, + "text": " what" + }, + { + "id": 434, + "logprob": -5.6210938, + "text": "'s" + }, + { + "id": 253, + "logprob": -0.81103516, + "text": " the" + }, + { + "id": 2892, + "logprob": -6.6640625, + "text": " history" + }, + { + "id": 3212, + "logprob": -2.265625, + "text": " behind" + }, + { + "id": 436, + "logprob": -11.5078125, + "text": " this" + }, + { + "id": 3159, + "logprob": -2.1582031, + "text": " word" + }, + { + "id": 32, + "logprob": -0.008720398, + "text": "?" 
+ }, + { + "id": 0, + "logprob": -2.4726562, + "text": "<|endoftext|>" + }, + { + "id": 50281, + "logprob": -18.265625, + "text": "<|assistant|>" + } + ], + "seed": null, + "tokens": [ + { + "id": 510, + "logprob": -0.63183594, + "special": false, + "text": "The" + }, + { + "id": 3159, + "logprob": -0.5488281, + "special": false, + "text": " word" + }, + { + "id": 346, + "logprob": -0.045684814, + "special": false, + "text": " \"" + }, + { + "id": 6441, + "logprob": -0.00207901, + "special": false, + "text": "mem" + }, + { + "id": 70, + "logprob": -1.335144e-05, + "special": false, + "text": "e" + }, + { + "id": 3, + "logprob": -0.00097227097, + "special": false, + "text": "\"" + }, + { + "id": 369, + "logprob": -0.0892334, + "special": false, + "text": " was" + }, + { + "id": 806, + "logprob": -0.12463379, + "special": false, + "text": " first" + }, + { + "id": 908, + "logprob": -0.01737976, + "special": false, + "text": " used" + }, + { + "id": 275, + "logprob": -0.50341797, + "special": false, + "text": " in" + } + ] + }, + "generated_text": "The word \"meme\" was first used in" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 50278, + "logprob": null, + "text": "<|prompter|>" + }, + { + "id": 1276, + "logprob": -8.03125, + "text": "What" + }, + { + "id": 310, + "logprob": -5.421875, + "text": " is" + }, + { + "id": 247, + "logprob": -2.1601562, + "text": " a" + }, + { + "id": 1167, + "logprob": -5.4609375, + "text": " mem" + }, + { + "id": 70, + "logprob": -0.005657196, + "text": "e" + }, + { + "id": 13, + "logprob": -7.28125, + "text": "," + }, + { + "id": 285, + "logprob": -0.2980957, + "text": " and" + }, + { + "id": 752, + "logprob": -2.1679688, + "text": " what" + }, + { + "id": 434, + "logprob": -5.6210938, + "text": "'s" + }, + { + "id": 253, + "logprob": -0.81103516, + "text": " the" + }, + { + "id": 2892, + "logprob": -6.6640625, + "text": " history" + }, + { + "id": 3212, + "logprob": -2.265625, + "text": " behind" + }, + { + "id": 436, + "logprob": -11.5078125, + "text": " this" + }, + { + "id": 3159, + "logprob": -2.1582031, + "text": " word" + }, + { + "id": 32, + "logprob": -0.008720398, + "text": "?" 
+ }, + { + "id": 0, + "logprob": -2.4726562, + "text": "<|endoftext|>" + }, + { + "id": 50281, + "logprob": -18.265625, + "text": "<|assistant|>" + } + ], + "seed": null, + "tokens": [ + { + "id": 510, + "logprob": -0.63183594, + "special": false, + "text": "The" + }, + { + "id": 3159, + "logprob": -0.5488281, + "special": false, + "text": " word" + }, + { + "id": 346, + "logprob": -0.045684814, + "special": false, + "text": " \"" + }, + { + "id": 6441, + "logprob": -0.00207901, + "special": false, + "text": "mem" + }, + { + "id": 70, + "logprob": -1.335144e-05, + "special": false, + "text": "e" + }, + { + "id": 3, + "logprob": -0.00097227097, + "special": false, + "text": "\"" + }, + { + "id": 369, + "logprob": -0.0892334, + "special": false, + "text": " was" + }, + { + "id": 806, + "logprob": -0.12463379, + "special": false, + "text": " first" + }, + { + "id": 908, + "logprob": -0.01737976, + "special": false, + "text": " used" + }, + { + "id": 275, + "logprob": -0.50341797, + "special": false, + "text": " in" + } + ] + }, + "generated_text": "The word \"meme\" was first used in" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_santacoder.ambr b/integration-tests/models/__snapshots__/test_flash_santacoder.ambr deleted file mode 100644 index 030820cb..00000000 --- a/integration-tests/models/__snapshots__/test_flash_santacoder.ambr +++ /dev/null @@ -1,472 +0,0 @@ -# serializer version: 1 -# name: test_flash_santacoder - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 563, - 'text': 'def', - }), - dict({ - 'id': 942, - 'text': ' print', - }), - dict({ - 'id': 62, - 'text': '_', - }), - dict({ - 'id': 7196, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 1241, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 258, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 942, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 372, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 7371, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 9956, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 8657, - 'special': False, - 'text': '!")', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1018, - 'special': False, - 'text': 'print', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World!") - - print - ''', - }) -# --- -# name: test_flash_santacoder_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 563, - 'text': 'def', - }), - dict({ - 'id': 942, - 'text': ' print', - }), - dict({ - 'id': 62, - 'text': '_', - }), - dict({ - 'id': 7196, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 1241, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 258, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 942, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 372, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 7371, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 9956, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 8657, - 'special': False, - 'text': '!")', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 
185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1018, - 'special': False, - 'text': 'print', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World!") - - print - ''', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 563, - 'text': 'def', - }), - dict({ - 'id': 942, - 'text': ' print', - }), - dict({ - 'id': 62, - 'text': '_', - }), - dict({ - 'id': 7196, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 1241, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 258, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 942, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 372, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 7371, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 9956, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 8657, - 'special': False, - 'text': '!")', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1018, - 'special': False, - 'text': 'print', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World!") - - print - ''', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 563, - 'text': 'def', - }), - dict({ - 'id': 942, - 'text': ' print', - }), - dict({ - 'id': 62, - 'text': '_', - }), - dict({ - 'id': 7196, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 1241, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 258, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 942, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 372, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 7371, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 9956, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 8657, - 'special': False, - 'text': '!")', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1018, - 'special': False, - 'text': 'print', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World!") - - print - ''', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 563, - 'text': 'def', - }), - dict({ - 'id': 942, - 'text': ' print', - }), - dict({ - 'id': 62, - 'text': '_', - }), - dict({ - 'id': 7196, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 1241, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 258, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 942, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 372, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 7371, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 9956, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 8657, - 'special': False, - 'text': '!")', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 185, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1018, - 'special': False, - 'text': 'print', - }), - ]), - }), - 'generated_text': ''' - (): - 
print("Hello World!") - - print - ''', - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder.json b/integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder.json new file mode 100644 index 00000000..0293e35a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder.json @@ -0,0 +1,93 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 563, + "logprob": null, + "text": "def" + }, + { + "id": 942, + "logprob": -5.1367188, + "text": " print" + }, + { + "id": 62, + "logprob": -0.24450684, + "text": "_" + }, + { + "id": 7196, + "logprob": -6.9609375, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 1241, + "logprob": -0.9863281, + "special": false, + "text": "():" + }, + { + "id": 258, + "logprob": -0.21447754, + "special": false, + "text": "\n " + }, + { + "id": 942, + "logprob": -0.43701172, + "special": false, + "text": " print" + }, + { + "id": 372, + "logprob": -0.5361328, + "special": false, + "text": "(\"" + }, + { + "id": 7371, + "logprob": -0.44555664, + "special": false, + "text": "Hello" + }, + { + "id": 9956, + "logprob": -1.2412109, + "special": false, + "text": " World" + }, + { + "id": 8657, + "logprob": -0.7583008, + "special": false, + "text": "!\")" + }, + { + "id": 185, + "logprob": -0.76171875, + "special": false, + "text": "\n" + }, + { + "id": 185, + "logprob": -0.20837402, + "special": false, + "text": "\n" + }, + { + "id": 1018, + "logprob": -1.2470703, + "special": false, + "text": "print" + } + ] + }, + "generated_text": "():\n print(\"Hello World!\")\n\nprint" +} diff --git a/integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder_load.json b/integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder_load.json new file mode 100644 index 00000000..a03580b3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder_load.json @@ -0,0 +1,374 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 563, + "logprob": null, + "text": "def" + }, + { + "id": 942, + "logprob": -5.1367188, + "text": " print" + }, + { + "id": 62, + "logprob": -0.24450684, + "text": "_" + }, + { + "id": 7196, + "logprob": -6.9609375, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 1241, + "logprob": -0.9863281, + "special": false, + "text": "():" + }, + { + "id": 258, + "logprob": -0.21362305, + "special": false, + "text": "\n " + }, + { + "id": 942, + "logprob": -0.44360352, + "special": false, + "text": " print" + }, + { + "id": 372, + "logprob": -0.54248047, + "special": false, + "text": "(\"" + }, + { + "id": 7371, + "logprob": -0.44555664, + "special": false, + "text": "Hello" + }, + { + "id": 9956, + "logprob": -1.2441406, + "special": false, + "text": " World" + }, + { + "id": 8657, + "logprob": -0.75878906, + "special": false, + "text": "!\")" + }, + { + "id": 185, + "logprob": -0.76171875, + "special": false, + "text": "\n" + }, + { + "id": 185, + "logprob": -0.2084961, + "special": false, + "text": "\n" + }, + { + "id": 1018, + "logprob": -1.2460938, + "special": false, + "text": "print" + } + ] + }, + "generated_text": "():\n print(\"Hello World!\")\n\nprint" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + 
"prefill": [ + { + "id": 563, + "logprob": null, + "text": "def" + }, + { + "id": 942, + "logprob": -5.1367188, + "text": " print" + }, + { + "id": 62, + "logprob": -0.24450684, + "text": "_" + }, + { + "id": 7196, + "logprob": -6.9609375, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 1241, + "logprob": -0.9863281, + "special": false, + "text": "():" + }, + { + "id": 258, + "logprob": -0.21362305, + "special": false, + "text": "\n " + }, + { + "id": 942, + "logprob": -0.44360352, + "special": false, + "text": " print" + }, + { + "id": 372, + "logprob": -0.54248047, + "special": false, + "text": "(\"" + }, + { + "id": 7371, + "logprob": -0.44555664, + "special": false, + "text": "Hello" + }, + { + "id": 9956, + "logprob": -1.2441406, + "special": false, + "text": " World" + }, + { + "id": 8657, + "logprob": -0.75878906, + "special": false, + "text": "!\")" + }, + { + "id": 185, + "logprob": -0.76171875, + "special": false, + "text": "\n" + }, + { + "id": 185, + "logprob": -0.2084961, + "special": false, + "text": "\n" + }, + { + "id": 1018, + "logprob": -1.2460938, + "special": false, + "text": "print" + } + ] + }, + "generated_text": "():\n print(\"Hello World!\")\n\nprint" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 563, + "logprob": null, + "text": "def" + }, + { + "id": 942, + "logprob": -5.1367188, + "text": " print" + }, + { + "id": 62, + "logprob": -0.24450684, + "text": "_" + }, + { + "id": 7196, + "logprob": -6.9609375, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 1241, + "logprob": -0.9863281, + "special": false, + "text": "():" + }, + { + "id": 258, + "logprob": -0.21362305, + "special": false, + "text": "\n " + }, + { + "id": 942, + "logprob": -0.44360352, + "special": false, + "text": " print" + }, + { + "id": 372, + "logprob": -0.54248047, + "special": false, + "text": "(\"" + }, + { + "id": 7371, + "logprob": -0.44555664, + "special": false, + "text": "Hello" + }, + { + "id": 9956, + "logprob": -1.2441406, + "special": false, + "text": " World" + }, + { + "id": 8657, + "logprob": -0.75878906, + "special": false, + "text": "!\")" + }, + { + "id": 185, + "logprob": -0.76171875, + "special": false, + "text": "\n" + }, + { + "id": 185, + "logprob": -0.2084961, + "special": false, + "text": "\n" + }, + { + "id": 1018, + "logprob": -1.2460938, + "special": false, + "text": "print" + } + ] + }, + "generated_text": "():\n print(\"Hello World!\")\n\nprint" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 563, + "logprob": null, + "text": "def" + }, + { + "id": 942, + "logprob": -5.1367188, + "text": " print" + }, + { + "id": 62, + "logprob": -0.24450684, + "text": "_" + }, + { + "id": 7196, + "logprob": -6.9609375, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 1241, + "logprob": -0.9863281, + "special": false, + "text": "():" + }, + { + "id": 258, + "logprob": -0.21362305, + "special": false, + "text": "\n " + }, + { + "id": 942, + "logprob": -0.44360352, + "special": false, + "text": " print" + }, + { + "id": 372, + "logprob": -0.54248047, + "special": false, + "text": "(\"" + }, + { + "id": 7371, + "logprob": -0.44555664, + "special": false, + "text": "Hello" + }, + { + "id": 9956, + "logprob": -1.2441406, + "special": false, + "text": " World" + }, + { + "id": 8657, + "logprob": -0.75878906, + "special": false, + "text": "!\")" + }, + { + 
"id": 185, + "logprob": -0.76171875, + "special": false, + "text": "\n" + }, + { + "id": 185, + "logprob": -0.2084961, + "special": false, + "text": "\n" + }, + { + "id": 1018, + "logprob": -1.2460938, + "special": false, + "text": "print" + } + ] + }, + "generated_text": "():\n print(\"Hello World!\")\n\nprint" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder.ambr b/integration-tests/models/__snapshots__/test_flash_starcoder.ambr deleted file mode 100644 index e0f4b568..00000000 --- a/integration-tests/models/__snapshots__/test_flash_starcoder.ambr +++ /dev/null @@ -1,573 +0,0 @@ -# serializer version: 1 -# name: test_flash_starcoder - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 589, - 'text': 'def', - }), - dict({ - 'id': 1459, - 'text': ' print', - }), - dict({ - 'id': 81, - 'text': '_', - }), - dict({ - 'id': 7656, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 2262, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 284, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1459, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 440, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 8279, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 10896, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 657, - 'special': False, - 'text': '")', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 589, - 'special': False, - 'text': 'def', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World") - - def - ''', - }) -# --- -# name: test_flash_starcoder_default_params - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 12, - 'prefill': list([ - dict({ - 'id': 589, - 'text': 'def', - }), - dict({ - 'id': 1459, - 'text': ' print', - }), - dict({ - 'id': 81, - 'text': '_', - }), - dict({ - 'id': 7656, - 'text': 'hello', - }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 2262, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 284, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 5741, - 'special': False, - 'text': ' logging', - }), - dict({ - 'id': 32, - 'special': False, - 'text': '.', - }), - dict({ - 'id': 1338, - 'special': False, - 'text': 'info', - }), - dict({ - 'id': 463, - 'special': False, - 'text': "('", - }), - dict({ - 'id': 8279, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 30, - 'special': False, - 'text': ',', - }), - dict({ - 'id': 10896, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 683, - 'special': False, - 'text': "')", - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 0, - 'special': True, - 'text': '<|endoftext|>', - }), - ]), - }), - 'generated_text': ''' - (): - logging.info('Hello, World') - <|endoftext|> - ''', - }) -# --- -# name: test_flash_starcoder_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 589, - 'text': 'def', - }), - dict({ - 'id': 1459, - 'text': ' print', - }), - dict({ - 'id': 81, - 'text': '_', - }), - dict({ - 'id': 7656, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 2262, - 'special': False, - 
'text': '():', - }), - dict({ - 'id': 284, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1459, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 440, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 8279, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 10896, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 657, - 'special': False, - 'text': '")', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 589, - 'special': False, - 'text': 'def', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World") - - def - ''', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 589, - 'text': 'def', - }), - dict({ - 'id': 1459, - 'text': ' print', - }), - dict({ - 'id': 81, - 'text': '_', - }), - dict({ - 'id': 7656, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 2262, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 284, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1459, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 440, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 8279, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 10896, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 657, - 'special': False, - 'text': '")', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 589, - 'special': False, - 'text': 'def', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World") - - def - ''', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 589, - 'text': 'def', - }), - dict({ - 'id': 1459, - 'text': ' print', - }), - dict({ - 'id': 81, - 'text': '_', - }), - dict({ - 'id': 7656, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 2262, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 284, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1459, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 440, - 'special': False, - 'text': '("', - }), - dict({ - 'id': 8279, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 10896, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 657, - 'special': False, - 'text': '")', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 589, - 'special': False, - 'text': 'def', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World") - - def - ''', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 589, - 'text': 'def', - }), - dict({ - 'id': 1459, - 'text': ' print', - }), - dict({ - 'id': 81, - 'text': '_', - }), - dict({ - 'id': 7656, - 'text': 'hello', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 2262, - 'special': False, - 'text': '():', - }), - dict({ - 'id': 284, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 1459, - 'special': False, - 'text': ' print', - }), - dict({ - 'id': 440, - 
'special': False, - 'text': '("', - }), - dict({ - 'id': 8279, - 'special': False, - 'text': 'Hello', - }), - dict({ - 'id': 10896, - 'special': False, - 'text': ' World', - }), - dict({ - 'id': 657, - 'special': False, - 'text': '")', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 203, - 'special': False, - 'text': ''' - - - ''', - }), - dict({ - 'id': 589, - 'special': False, - 'text': 'def', - }), - ]), - }), - 'generated_text': ''' - (): - print("Hello World") - - def - ''', - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder.json b/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder.json new file mode 100644 index 00000000..8505c1db --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder.json @@ -0,0 +1,93 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 589, + "logprob": null, + "text": "def" + }, + { + "id": 1459, + "logprob": -5.6289062, + "text": " print" + }, + { + "id": 81, + "logprob": -1.6005859, + "text": "_" + }, + { + "id": 7656, + "logprob": -5.9921875, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 2262, + "logprob": -0.7705078, + "special": false, + "text": "():" + }, + { + "id": 284, + "logprob": -0.2590332, + "special": false, + "text": "\n " + }, + { + "id": 1459, + "logprob": -0.39379883, + "special": false, + "text": " print" + }, + { + "id": 440, + "logprob": -0.61376953, + "special": false, + "text": "(\"" + }, + { + "id": 8279, + "logprob": -0.47338867, + "special": false, + "text": "Hello" + }, + { + "id": 10896, + "logprob": -1.5068359, + "special": false, + "text": " World" + }, + { + "id": 657, + "logprob": -0.80810547, + "special": false, + "text": "\")" + }, + { + "id": 203, + "logprob": -0.7397461, + "special": false, + "text": "\n" + }, + { + "id": 203, + "logprob": -0.35229492, + "special": false, + "text": "\n" + }, + { + "id": 589, + "logprob": -1.0371094, + "special": false, + "text": "def" + } + ] + }, + "generated_text": "():\n print(\"Hello World\")\n\ndef" +} diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_default_params.json new file mode 100644 index 00000000..21bb509b --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_default_params.json @@ -0,0 +1,105 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 12, + "prefill": [ + { + "id": 589, + "logprob": null, + "text": "def" + }, + { + "id": 1459, + "logprob": -5.6289062, + "text": " print" + }, + { + "id": 81, + "logprob": -1.6005859, + "text": "_" + }, + { + "id": 7656, + "logprob": -5.9921875, + "text": "hello" + } + ], + "seed": 0, + "tokens": [ + { + "id": 2262, + "logprob": -0.7451172, + "special": false, + "text": "():" + }, + { + "id": 284, + "logprob": -0.21325684, + "special": false, + "text": "\n " + }, + { + "id": 5741, + "logprob": -5.734375, + "special": false, + "text": " logging" + }, + { + "id": 32, + "logprob": 0.0, + "special": false, + "text": "." 
+ }, + { + "id": 1338, + "logprob": -0.3232422, + "special": false, + "text": "info" + }, + { + "id": 463, + "logprob": -1.0380859, + "special": false, + "text": "('" + }, + { + "id": 8279, + "logprob": -0.8378906, + "special": false, + "text": "Hello" + }, + { + "id": 30, + "logprob": -1.9501953, + "special": false, + "text": "," + }, + { + "id": 10896, + "logprob": -1.3476562, + "special": false, + "text": " World" + }, + { + "id": 683, + "logprob": -1.796875, + "special": false, + "text": "')" + }, + { + "id": 203, + "logprob": -0.9873047, + "special": false, + "text": "\n" + }, + { + "id": 0, + "logprob": -0.7495117, + "special": true, + "text": "<|endoftext|>" + } + ] + }, + "generated_text": "():\n logging.info('Hello, World')\n<|endoftext|>" +} diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_load.json b/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_load.json new file mode 100644 index 00000000..0b3ad554 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_load.json @@ -0,0 +1,374 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 589, + "logprob": null, + "text": "def" + }, + { + "id": 1459, + "logprob": -5.6289062, + "text": " print" + }, + { + "id": 81, + "logprob": -1.6005859, + "text": "_" + }, + { + "id": 7656, + "logprob": -5.9921875, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 2262, + "logprob": -0.7705078, + "special": false, + "text": "():" + }, + { + "id": 284, + "logprob": -0.2602539, + "special": false, + "text": "\n " + }, + { + "id": 1459, + "logprob": -0.39282227, + "special": false, + "text": " print" + }, + { + "id": 440, + "logprob": -0.6113281, + "special": false, + "text": "(\"" + }, + { + "id": 8279, + "logprob": -0.4765625, + "special": false, + "text": "Hello" + }, + { + "id": 10896, + "logprob": -1.5068359, + "special": false, + "text": " World" + }, + { + "id": 657, + "logprob": -0.8154297, + "special": false, + "text": "\")" + }, + { + "id": 203, + "logprob": -0.7319336, + "special": false, + "text": "\n" + }, + { + "id": 203, + "logprob": -0.35229492, + "special": false, + "text": "\n" + }, + { + "id": 589, + "logprob": -1.0380859, + "special": false, + "text": "def" + } + ] + }, + "generated_text": "():\n print(\"Hello World\")\n\ndef" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 589, + "logprob": null, + "text": "def" + }, + { + "id": 1459, + "logprob": -5.6289062, + "text": " print" + }, + { + "id": 81, + "logprob": -1.6005859, + "text": "_" + }, + { + "id": 7656, + "logprob": -5.9921875, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 2262, + "logprob": -0.7705078, + "special": false, + "text": "():" + }, + { + "id": 284, + "logprob": -0.2602539, + "special": false, + "text": "\n " + }, + { + "id": 1459, + "logprob": -0.39282227, + "special": false, + "text": " print" + }, + { + "id": 440, + "logprob": -0.6113281, + "special": false, + "text": "(\"" + }, + { + "id": 8279, + "logprob": -0.4765625, + "special": false, + "text": "Hello" + }, + { + "id": 10896, + "logprob": -1.5068359, + "special": false, + "text": " World" + }, + { + "id": 657, + "logprob": -0.8154297, + "special": false, + "text": "\")" + }, + { + "id": 203, + "logprob": -0.7319336, + "special": false, + "text": "\n" + }, + { + "id": 203, + 
"logprob": -0.35229492, + "special": false, + "text": "\n" + }, + { + "id": 589, + "logprob": -1.0380859, + "special": false, + "text": "def" + } + ] + }, + "generated_text": "():\n print(\"Hello World\")\n\ndef" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 589, + "logprob": null, + "text": "def" + }, + { + "id": 1459, + "logprob": -5.6289062, + "text": " print" + }, + { + "id": 81, + "logprob": -1.6005859, + "text": "_" + }, + { + "id": 7656, + "logprob": -5.9921875, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 2262, + "logprob": -0.7705078, + "special": false, + "text": "():" + }, + { + "id": 284, + "logprob": -0.2602539, + "special": false, + "text": "\n " + }, + { + "id": 1459, + "logprob": -0.39282227, + "special": false, + "text": " print" + }, + { + "id": 440, + "logprob": -0.6113281, + "special": false, + "text": "(\"" + }, + { + "id": 8279, + "logprob": -0.4765625, + "special": false, + "text": "Hello" + }, + { + "id": 10896, + "logprob": -1.5068359, + "special": false, + "text": " World" + }, + { + "id": 657, + "logprob": -0.8154297, + "special": false, + "text": "\")" + }, + { + "id": 203, + "logprob": -0.7319336, + "special": false, + "text": "\n" + }, + { + "id": 203, + "logprob": -0.35229492, + "special": false, + "text": "\n" + }, + { + "id": 589, + "logprob": -1.0380859, + "special": false, + "text": "def" + } + ] + }, + "generated_text": "():\n print(\"Hello World\")\n\ndef" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 589, + "logprob": null, + "text": "def" + }, + { + "id": 1459, + "logprob": -5.6289062, + "text": " print" + }, + { + "id": 81, + "logprob": -1.6005859, + "text": "_" + }, + { + "id": 7656, + "logprob": -5.9921875, + "text": "hello" + } + ], + "seed": null, + "tokens": [ + { + "id": 2262, + "logprob": -0.7705078, + "special": false, + "text": "():" + }, + { + "id": 284, + "logprob": -0.2602539, + "special": false, + "text": "\n " + }, + { + "id": 1459, + "logprob": -0.39282227, + "special": false, + "text": " print" + }, + { + "id": 440, + "logprob": -0.6113281, + "special": false, + "text": "(\"" + }, + { + "id": 8279, + "logprob": -0.4765625, + "special": false, + "text": "Hello" + }, + { + "id": 10896, + "logprob": -1.5068359, + "special": false, + "text": " World" + }, + { + "id": 657, + "logprob": -0.8154297, + "special": false, + "text": "\")" + }, + { + "id": 203, + "logprob": -0.7319336, + "special": false, + "text": "\n" + }, + { + "id": 203, + "logprob": -0.35229492, + "special": false, + "text": "\n" + }, + { + "id": 589, + "logprob": -1.0380859, + "special": false, + "text": "def" + } + ] + }, + "generated_text": "():\n print(\"Hello World\")\n\ndef" + } +] diff --git a/integration-tests/models/__snapshots__/test_mt0_base.ambr b/integration-tests/models/__snapshots__/test_mt0_base.ambr deleted file mode 100644 index d7c6eaf6..00000000 --- a/integration-tests/models/__snapshots__/test_mt0_base.ambr +++ /dev/null @@ -1,306 +0,0 @@ -# serializer version: 1 -# name: test_mt0_base - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 5, - 'prefill': list([ - dict({ - 'id': 0, - 'text': '', - }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 926, - 'special': False, - 'text': 'To', - }), - dict({ - 'id': 18295, - 'special': False, - 'text': ' sell', - }), - dict({ - 'id': 7868, - 'special': False, 
- 'text': ' things', - }), - dict({ - 'id': 260, - 'special': False, - 'text': '.', - }), - dict({ - 'id': 1, - 'special': True, - 'text': '', - }), - ]), - }), - 'generated_text': 'To sell things.', - }) -# --- -# name: test_mt0_base_all_params - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 10, - 'prefill': list([ - dict({ - 'id': 0, - 'text': '', - }), - ]), - 'seed': 0, - 'tokens': list([ - dict({ - 'id': 16017, - 'special': False, - 'text': 'blue', - }), - dict({ - 'id': 20495, - 'special': False, - 'text': ' sky', - }), - dict({ - 'id': 259, - 'special': False, - 'text': ' ', - }), - dict({ - 'id': 15484, - 'special': False, - 'text': 'appear', - }), - dict({ - 'id': 345, - 'special': False, - 'text': 'ed', - }), - dict({ - 'id': 288, - 'special': False, - 'text': ' to', - }), - dict({ - 'id': 35622, - 'special': False, - 'text': ' cloud', - }), - dict({ - 'id': 263, - 'special': False, - 'text': 's', - }), - dict({ - 'id': 14701, - 'special': False, - 'text': ' above', - }), - dict({ - 'id': 751, - 'special': False, - 'text': ' all', - }), - ]), - }), - 'generated_text': 'Why is the sky blue?blue sky appeared to clouds above all', - }) -# --- -# name: test_mt0_base_load - list([ - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 6, - 'prefill': list([ - dict({ - 'id': 0, - 'text': '', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 259, - 'special': False, - 'text': '', - }), - dict({ - 'id': 39261, - 'special': False, - 'text': 'Because', - }), - dict({ - 'id': 609, - 'special': False, - 'text': ' it', - }), - dict({ - 'id': 339, - 'special': False, - 'text': ' is', - }), - dict({ - 'id': 16017, - 'special': False, - 'text': ' blue', - }), - dict({ - 'id': 1, - 'special': True, - 'text': '', - }), - ]), - }), - 'generated_text': 'Because it is blue', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 6, - 'prefill': list([ - dict({ - 'id': 0, - 'text': '', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 259, - 'special': False, - 'text': '', - }), - dict({ - 'id': 39261, - 'special': False, - 'text': 'Because', - }), - dict({ - 'id': 609, - 'special': False, - 'text': ' it', - }), - dict({ - 'id': 339, - 'special': False, - 'text': ' is', - }), - dict({ - 'id': 16017, - 'special': False, - 'text': ' blue', - }), - dict({ - 'id': 1, - 'special': True, - 'text': '', - }), - ]), - }), - 'generated_text': 'Because it is blue', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 6, - 'prefill': list([ - dict({ - 'id': 0, - 'text': '', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 259, - 'special': False, - 'text': '', - }), - dict({ - 'id': 39261, - 'special': False, - 'text': 'Because', - }), - dict({ - 'id': 609, - 'special': False, - 'text': ' it', - }), - dict({ - 'id': 339, - 'special': False, - 'text': ' is', - }), - dict({ - 'id': 16017, - 'special': False, - 'text': ' blue', - }), - dict({ - 'id': 1, - 'special': True, - 'text': '', - }), - ]), - }), - 'generated_text': 'Because it is blue', - }), - dict({ - 'details': dict({ - 'best_of_sequences': None, - 'finish_reason': , - 'generated_tokens': 6, - 'prefill': list([ - dict({ - 'id': 0, - 'text': '', - }), - ]), - 'seed': None, - 'tokens': list([ - dict({ - 'id': 259, - 'special': False, - 'text': '', - }), - dict({ - 'id': 39261, - 'special': False, - 'text': 
'Because', - }), - dict({ - 'id': 609, - 'special': False, - 'text': ' it', - }), - dict({ - 'id': 339, - 'special': False, - 'text': ' is', - }), - dict({ - 'id': 16017, - 'special': False, - 'text': ' blue', - }), - dict({ - 'id': 1, - 'special': True, - 'text': '', - }), - ]), - }), - 'generated_text': 'Because it is blue', - }), - ]) -# --- diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json new file mode 100644 index 00000000..2a26e3db --- /dev/null +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json @@ -0,0 +1,48 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 5, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": 0, + "tokens": [ + { + "id": 926, + "logprob": -4.3554688, + "special": false, + "text": "To" + }, + { + "id": 18295, + "logprob": -7.7734375, + "special": false, + "text": " sell" + }, + { + "id": 7868, + "logprob": -3.9257812, + "special": false, + "text": " things" + }, + { + "id": 260, + "logprob": -2.4179688, + "special": false, + "text": "." + }, + { + "id": 1, + "logprob": 0.0, + "special": true, + "text": "" + } + ] + }, + "generated_text": "To sell things." +} diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json new file mode 100644 index 00000000..fd77252d --- /dev/null +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json @@ -0,0 +1,78 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": 0, + "tokens": [ + { + "id": 16017, + "logprob": -1.3505859, + "special": false, + "text": "blue" + }, + { + "id": 20495, + "logprob": -0.50439453, + "special": false, + "text": " sky" + }, + { + "id": 259, + "logprob": -1.2011719, + "special": false, + "text": " " + }, + { + "id": 15484, + "logprob": -2.8378906, + "special": false, + "text": "appear" + }, + { + "id": 345, + "logprob": -0.87597656, + "special": false, + "text": "ed" + }, + { + "id": 288, + "logprob": -1.8447266, + "special": false, + "text": " to" + }, + { + "id": 35622, + "logprob": -7.1445312, + "special": false, + "text": " cloud" + }, + { + "id": 263, + "logprob": -1.2929688, + "special": false, + "text": "s" + }, + { + "id": 14701, + "logprob": -3.0761719, + "special": false, + "text": " above" + }, + { + "id": 751, + "logprob": -4.4375, + "special": false, + "text": " all" + } + ] + }, + "generated_text": "Why is the sky blue?blue sky appeared to clouds above all" +} diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json new file mode 100644 index 00000000..c9e552b6 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json @@ -0,0 +1,218 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 6, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": null, + "tokens": [ + { + "id": 259, + "logprob": -1.3789062, + "special": false, + "text": "" + }, + { + "id": 39261, + "logprob": -0.36279297, + "special": false, + "text": "Because" + }, + { + "id": 609, + "logprob": -1.0966797, 
+ "special": false, + "text": " it" + }, + { + "id": 339, + "logprob": -0.8276367, + "special": false, + "text": " is" + }, + { + "id": 16017, + "logprob": -1.6845703, + "special": false, + "text": " blue" + }, + { + "id": 1, + "logprob": -0.72753906, + "special": true, + "text": "" + } + ] + }, + "generated_text": "Because it is blue" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 6, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": null, + "tokens": [ + { + "id": 259, + "logprob": -1.3798828, + "special": false, + "text": "" + }, + { + "id": 39261, + "logprob": -0.36328125, + "special": false, + "text": "Because" + }, + { + "id": 609, + "logprob": -1.0947266, + "special": false, + "text": " it" + }, + { + "id": 339, + "logprob": -0.8286133, + "special": false, + "text": " is" + }, + { + "id": 16017, + "logprob": -1.6826172, + "special": false, + "text": " blue" + }, + { + "id": 1, + "logprob": -0.7290039, + "special": true, + "text": "" + } + ] + }, + "generated_text": "Because it is blue" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 6, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": null, + "tokens": [ + { + "id": 259, + "logprob": -1.3789062, + "special": false, + "text": "" + }, + { + "id": 39261, + "logprob": -0.36279297, + "special": false, + "text": "Because" + }, + { + "id": 609, + "logprob": -1.0966797, + "special": false, + "text": " it" + }, + { + "id": 339, + "logprob": -0.8276367, + "special": false, + "text": " is" + }, + { + "id": 16017, + "logprob": -1.6845703, + "special": false, + "text": " blue" + }, + { + "id": 1, + "logprob": -0.72753906, + "special": true, + "text": "" + } + ] + }, + "generated_text": "Because it is blue" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 6, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": null, + "tokens": [ + { + "id": 259, + "logprob": -1.3789062, + "special": false, + "text": "" + }, + { + "id": 39261, + "logprob": -0.36279297, + "special": false, + "text": "Because" + }, + { + "id": 609, + "logprob": -1.0966797, + "special": false, + "text": " it" + }, + { + "id": 339, + "logprob": -0.8276367, + "special": false, + "text": " is" + }, + { + "id": 16017, + "logprob": -1.6845703, + "special": false, + "text": " blue" + }, + { + "id": 1, + "logprob": -0.72753906, + "special": true, + "text": "" + } + ] + }, + "generated_text": "Because it is blue" + } +] diff --git a/integration-tests/models/test_bloom_560m.py b/integration-tests/models/test_bloom_560m.py index e13606f7..3c598c04 100644 --- a/integration-tests/models/test_bloom_560m.py +++ b/integration-tests/models/test_bloom_560m.py @@ -1,18 +1,20 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def bloom_560(launcher): - with launcher("bigscience/bloom-560m") as client: - yield client +def bloom_560_handle(launcher): + with launcher("bigscience/bloom-560m") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def bloom_560(bloom_560_handle): + await bloom_560_handle.health(60) + return bloom_560_handle.client @pytest.mark.asyncio -async def test_bloom_560m(bloom_560, snapshot_test): - await health_check(bloom_560, 60) - +async def test_bloom_560m(bloom_560, response_snapshot): response = await bloom_560.generate( "Pour déguster un ortolan, il faut tout 
d'abord", max_new_tokens=10, @@ -21,13 +23,11 @@ async def test_bloom_560m(bloom_560, snapshot_test): ) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio -async def test_bloom_560m_all_params(bloom_560, snapshot_test): - await health_check(bloom_560, 60) - +async def test_bloom_560m_all_params(bloom_560, response_snapshot): response = await bloom_560.generate( "Pour déguster un ortolan, il faut tout d'abord", max_new_tokens=10, @@ -44,13 +44,11 @@ async def test_bloom_560m_all_params(bloom_560, snapshot_test): ) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio -async def test_bloom_560m_load(bloom_560, generate_load, snapshot_test): - await health_check(bloom_560, 60) - +async def test_bloom_560m_load(bloom_560, generate_load, response_snapshot): responses = await generate_load( bloom_560, "Pour déguster un ortolan, il faut tout d'abord", @@ -59,5 +57,6 @@ async def test_bloom_560m_load(bloom_560, generate_load, snapshot_test): ) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/test_bloom_560m_sharded.py b/integration-tests/models/test_bloom_560m_sharded.py index bfb70253..25f6b2d7 100644 --- a/integration-tests/models/test_bloom_560m_sharded.py +++ b/integration-tests/models/test_bloom_560m_sharded.py @@ -1,18 +1,20 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def bloom_560m_sharded(launcher): - with launcher("bigscience/bloom-560m", num_shard=2) as client: - yield client +def bloom_560m_sharded_handle(launcher): + with launcher("bigscience/bloom-560m", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def bloom_560m_sharded(bloom_560m_sharded_handle): + await bloom_560m_sharded_handle.health(60) + return bloom_560m_sharded_handle.client @pytest.mark.asyncio -async def test_bloom_560m_sharded(bloom_560m_sharded, snapshot_test): - await health_check(bloom_560m_sharded, 60) - +async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot): response = await bloom_560m_sharded.generate( "Pour déguster un ortolan, il faut tout d'abord", max_new_tokens=10, @@ -21,15 +23,13 @@ async def test_bloom_560m_sharded(bloom_560m_sharded, snapshot_test): ) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio async def test_bloom_560m_sharded_load( - bloom_560m_sharded, generate_load, snapshot_test + bloom_560m_sharded, generate_load, response_snapshot ): - await health_check(bloom_560m_sharded, 60) - responses = await generate_load( bloom_560m_sharded, "Pour déguster un ortolan, il faut tout d'abord", @@ -38,5 +38,6 @@ async def test_bloom_560m_sharded_load( ) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/test_flash_llama.py b/integration-tests/models/test_flash_llama.py index 4d1f2bcf..37468455 100644 --- a/integration-tests/models/test_flash_llama.py +++ b/integration-tests/models/test_flash_llama.py @@ -1,30 +1,30 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def 
flash_llama(launcher): - with launcher("huggingface/llama-7b", num_shard=2) as client: - yield client +def flash_llama_handle(launcher): + with launcher("huggingface/llama-7b", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_llama(flash_llama_handle): + await flash_llama_handle.health(120) + return flash_llama_handle.client @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama(flash_llama, snapshot_test): - await health_check(flash_llama, 120) - +async def test_flash_llama(flash_llama, response_snapshot): response = await flash_llama.generate("Test request", max_new_tokens=10) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_all_params(flash_llama, snapshot_test): - await health_check(flash_llama, 120) - +async def test_flash_llama_all_params(flash_llama, response_snapshot): response = await flash_llama.generate( "Test request", max_new_tokens=10, @@ -41,16 +41,15 @@ async def test_flash_llama_all_params(flash_llama, snapshot_test): ) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_load(flash_llama, generate_load, snapshot_test): - await health_check(flash_llama, 120) - +async def test_flash_llama_load(flash_llama, generate_load, response_snapshot): responses = await generate_load(flash_llama, "Test request", max_new_tokens=10, n=4) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/test_flash_neox.py b/integration-tests/models/test_flash_neox.py index 8c981028..56cbf270 100644 --- a/integration-tests/models/test_flash_neox.py +++ b/integration-tests/models/test_flash_neox.py @@ -1,31 +1,31 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def flash_neox(launcher): - with launcher("OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2) as client: - yield client +def flash_neox_handle(launcher): + with launcher("OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_neox(flash_neox_handle): + await flash_neox_handle.health(240) + return flash_neox_handle.client @pytest.mark.asyncio -async def test_flash_neox(flash_neox, snapshot_test): - await health_check(flash_neox, 240) - +async def test_flash_neox(flash_neox, response_snapshot): response = await flash_neox.generate( "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>", max_new_tokens=10, ) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio -async def test_flash_neox_load(flash_neox, generate_load, snapshot_test): - await health_check(flash_neox, 240) - +async def test_flash_neox_load(flash_neox, generate_load, response_snapshot): responses = await generate_load( flash_neox, "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>", @@ -34,5 +34,6 @@ async def test_flash_neox_load(flash_neox, generate_load, snapshot_test): ) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert 
snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/test_flash_santacoder.py b/integration-tests/models/test_flash_santacoder.py index 64a59d78..b0cb4522 100644 --- a/integration-tests/models/test_flash_santacoder.py +++ b/integration-tests/models/test_flash_santacoder.py @@ -1,32 +1,35 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def flash_santacoder(launcher): - with launcher("bigcode/santacoder") as client: - yield client +def flash_santacoder_handle(launcher): + with launcher("bigcode/santacoder") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_santacoder(flash_santacoder_handle): + await flash_santacoder_handle.health(240) + return flash_santacoder_handle.client @pytest.mark.asyncio -async def test_flash_santacoder(flash_santacoder, snapshot_test): - await health_check(flash_santacoder, 60) - +async def test_flash_santacoder(flash_santacoder, response_snapshot): response = await flash_santacoder.generate("def print_hello", max_new_tokens=10) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio -async def test_flash_santacoder_load(flash_santacoder, generate_load, snapshot_test): - await health_check(flash_santacoder, 60) - +async def test_flash_santacoder_load( + flash_santacoder, generate_load, response_snapshot +): responses = await generate_load( flash_santacoder, "def print_hello", max_new_tokens=10, n=4 ) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/test_flash_starcoder.py b/integration-tests/models/test_flash_starcoder.py index d43e92dc..4c7393a7 100644 --- a/integration-tests/models/test_flash_starcoder.py +++ b/integration-tests/models/test_flash_starcoder.py @@ -1,47 +1,46 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def flash_starcoder(launcher): - with launcher("bigcode/starcoder", num_shard=2) as client: - yield client +def flash_starcoder_handle(launcher): + with launcher("bigcode/starcoder", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_starcoder(flash_starcoder_handle): + await flash_starcoder_handle.health(240) + return flash_starcoder_handle.client @pytest.mark.asyncio @pytest.mark.private -async def test_flash_starcoder(flash_starcoder, snapshot_test): - await health_check(flash_starcoder, 240) - +async def test_flash_starcoder(flash_starcoder, response_snapshot): response = await flash_starcoder.generate("def print_hello", max_new_tokens=10) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_starcoder_default_params(flash_starcoder, snapshot_test): - await health_check(flash_starcoder, 240) - +async def test_flash_starcoder_default_params(flash_starcoder, response_snapshot): response = await flash_starcoder.generate( "def print_hello", max_new_tokens=60, temperature=0.2, top_p=0.95, seed=0 ) assert response.details.generated_tokens == 12 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_starcoder_load(flash_starcoder, generate_load, snapshot_test): - await health_check(flash_starcoder, 
240) - +async def test_flash_starcoder_load(flash_starcoder, generate_load, response_snapshot): responses = await generate_load( flash_starcoder, "def print_hello", max_new_tokens=10, n=4 ) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/test_mt0_base.py b/integration-tests/models/test_mt0_base.py index 7310a30f..15410f73 100644 --- a/integration-tests/models/test_mt0_base.py +++ b/integration-tests/models/test_mt0_base.py @@ -1,18 +1,20 @@ import pytest -from utils import health_check - @pytest.fixture(scope="module") -def mt0_base(launcher): - with launcher("bigscience/mt0-base") as client: - yield client +def mt0_base_handle(launcher): + with launcher("bigscience/mt0-base") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def mt0_base(mt0_base_handle): + await mt0_base_handle.health(60) + return mt0_base_handle.client @pytest.mark.asyncio -async def test_mt0_base(mt0_base, snapshot_test): - await health_check(mt0_base, 60) - +async def test_mt0_base(mt0_base, response_snapshot): response = await mt0_base.generate( "Why is the sky blue?", max_new_tokens=10, @@ -21,13 +23,11 @@ async def test_mt0_base(mt0_base, snapshot_test): ) assert response.details.generated_tokens == 5 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio -async def test_mt0_base_all_params(mt0_base, snapshot_test): - await health_check(mt0_base, 60) - +async def test_mt0_base_all_params(mt0_base, response_snapshot): response = await mt0_base.generate( "Why is the sky blue?", max_new_tokens=10, @@ -44,13 +44,11 @@ async def test_mt0_base_all_params(mt0_base, snapshot_test): ) assert response.details.generated_tokens == 10 - assert snapshot_test(response) + assert response == response_snapshot @pytest.mark.asyncio -async def test_mt0_base_load(mt0_base, generate_load, snapshot_test): - await health_check(mt0_base, 60) - +async def test_mt0_base_load(mt0_base, generate_load, response_snapshot): responses = await generate_load( mt0_base, "Why is the sky blue?", @@ -59,5 +57,6 @@ async def test_mt0_base_load(mt0_base, generate_load, snapshot_test): ) assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) - assert snapshot_test(responses) + assert responses == response_snapshot diff --git a/integration-tests/models/utils.py b/integration-tests/models/utils.py deleted file mode 100644 index c47e4871..00000000 --- a/integration-tests/models/utils.py +++ /dev/null @@ -1,15 +0,0 @@ -import time - -from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError -from text_generation import AsyncClient - - -async def health_check(client: AsyncClient, timeout: int = 60): - assert timeout > 0 - for _ in range(timeout): - try: - await client.generate("test") - return - except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: - time.sleep(1) - raise RuntimeError("Health check failed") diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index 9ecbb2ee..051730ff 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,5 +1,5 @@ syrupy -text-generation==0.5.1 +text-generation==0.5.2 pytest pytest-asyncio==0.17.2 docker \ No newline at end of file From 5a58226130fcd35774cd14d8c5ba638a4adb2bf4 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: 
Tue, 16 May 2023 23:23:27 +0200 Subject: [PATCH 39/39] fix(server): fix decode token (#334) Fixes #333 --------- Co-authored-by: Nicolas Patry --- .github/workflows/build.yaml | 3 +- .github/workflows/tests.yaml | 3 +- Makefile | 2 +- integration-tests/conftest.py | 16 +- .../test_flash_llama_load.json | 176 +++++++++--------- .../test_mt0_base/test_mt0_base.json | 2 +- .../test_mt0_base_all_params.json | 2 +- .../test_mt0_base/test_mt0_base_load.json | 114 ++++++------ router/src/main.rs | 2 +- server/Makefile | 2 +- server/tests/models/test_model.py | 78 ++++++++ server/tests/models/test_seq2seq_lm.py | 2 +- server/text_generation_server/models/bloom.py | 5 +- .../models/causal_lm.py | 71 +++---- .../models/flash_causal_lm.py | 77 ++++---- .../models/flash_llama.py | 4 +- .../models/flash_neox.py | 2 +- .../models/flash_santacoder.py | 6 +- .../models/galactica.py | 16 +- .../text_generation_server/models/gpt_neox.py | 2 +- server/text_generation_server/models/model.py | 68 +++---- server/text_generation_server/models/opt.py | 2 +- .../models/santacoder.py | 20 +- .../models/seq2seq_lm.py | 66 +++---- server/text_generation_server/models/t5.py | 5 +- 25 files changed, 396 insertions(+), 350 deletions(-) create mode 100644 server/tests/models/test_model.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 79b3c777..9305b8e7 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -213,12 +213,13 @@ jobs: sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }} - name: Install run: | + pip install pytest-xdist make install-integration-tests - name: Run tests run: | export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }} export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} - pytest -s -vv integration-tests + pytest -s -vv -n 2 --dist loadfile integration-tests stop-runner: name: Stop self-hosted EC2 runner diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index d9858a3b..7e5ba52c 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -66,7 +66,8 @@ jobs: - name: Run server tests run: | pip install pytest - make python-server-tests + export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} + pytest -s -vv server/tests - name: Run Rust fmt run: | cargo fmt --check diff --git a/Makefile b/Makefile index 29c318fa..7309aaee 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ update-integration-tests: install-integration-tests pytest -s -vv --snapshot-update integration-tests python-server-tests: - HF_HUB_ENABLE_HF_TRANSFER=1 pytest server/tests + HF_HUB_ENABLE_HF_TRANSFER=1 pytest -s -vv -m "not private" server/tests python-client-tests: pytest clients/python/tests diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index ba1abca9..3086ecda 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,3 +1,4 @@ +import sys import subprocess import contextlib import pytest @@ -7,6 +8,7 @@ import docker import json import math import time +import random from docker.errors import NotFound from typing import Optional, List, Dict @@ -205,10 +207,12 @@ def launcher(event_loop): def local_launcher( model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None ): - port = 9999 - master_port = 19999 + port = random.randint(8000, 10_000) + master_port = random.randint(10_000, 20_000) - shard_uds_path = f"/tmp/{model_id.replace('/', 
'--')}-server" + shard_uds_path = ( + f"/tmp/tgi-tests-{model_id.split('/')[-1]}-{num_shard}-{quantize}-server" + ) args = [ "text-generation-launcher", @@ -236,7 +240,7 @@ def launcher(event_loop): process.wait(60) launcher_output = process.stdout.read().decode("utf-8") - print(launcher_output) + print(launcher_output, file=sys.stderr) process.stdout.close() process.stderr.close() @@ -245,7 +249,7 @@ def launcher(event_loop): def docker_launcher( model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None ): - port = 9999 + port = random.randint(8000, 10_000) args = ["--model-id", model_id, "--env"] @@ -298,7 +302,7 @@ def launcher(event_loop): pass container_output = container.logs().decode("utf-8") - print(container_output) + print(container_output, file=sys.stderr) container.remove() diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json index 5a8ba217..9bbb5322 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json +++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json @@ -1,92 +1,4 @@ [ - { - "details": { - "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 10, - "prefill": [ - { - "id": 1, - "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -8.6875, - "text": "Test" - }, - { - "id": 2009, - "logprob": -11.5546875, - "text": "request" - } - ], - "seed": null, - "tokens": [ - { - "id": 363, - "logprob": -1.5322266, - "special": false, - "text": " for" - }, - { - "id": 847, - "logprob": -2.5585938, - "special": false, - "text": " /" - }, - { - "id": 2754, - "logprob": -2.265625, - "special": false, - "text": "api" - }, - { - "id": 29914, - "logprob": -0.034088135, - "special": false, - "text": "/" - }, - { - "id": 29894, - "logprob": -0.96240234, - "special": false, - "text": "v" - }, - { - "id": 29896, - "logprob": -0.36816406, - "special": false, - "text": "1" - }, - { - "id": 29914, - "logprob": -0.013191223, - "special": false, - "text": "/" - }, - { - "id": 16418, - "logprob": -3.15625, - "special": false, - "text": "projects" - }, - { - "id": 29914, - "logprob": -0.43774414, - "special": false, - "text": "/" - }, - { - "id": 29896, - "logprob": -1.9443359, - "special": false, - "text": "1" - } - ] - }, - "generated_text": "for /api/v1/projects/1" - }, { "details": { "best_of_sequences": null, @@ -263,6 +175,94 @@ }, "generated_text": "for /api/v1/projects/1" }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -8.6875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -11.5546875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 363, + "logprob": -1.5322266, + "special": false, + "text": " for" + }, + { + "id": 847, + "logprob": -2.5585938, + "special": false, + "text": " /" + }, + { + "id": 2754, + "logprob": -2.265625, + "special": false, + "text": "api" + }, + { + "id": 29914, + "logprob": -0.034088135, + "special": false, + "text": "/" + }, + { + "id": 29894, + "logprob": -0.96240234, + "special": false, + "text": "v" + }, + { + "id": 29896, + "logprob": -0.36816406, + "special": false, + "text": "1" + }, + { + "id": 29914, + "logprob": -0.013191223, + "special": false, + "text": "/" + }, + { + "id": 16418, + "logprob": -3.15625, + "special": 
false, + "text": "projects" + }, + { + "id": 29914, + "logprob": -0.43774414, + "special": false, + "text": "/" + }, + { + "id": 29896, + "logprob": -1.9443359, + "special": false, + "text": "1" + } + ] + }, + "generated_text": "for /api/v1/projects/1" + }, { "details": { "best_of_sequences": null, diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json index 2a26e3db..c1cd24cd 100644 --- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json @@ -16,7 +16,7 @@ "id": 926, "logprob": -4.3554688, "special": false, - "text": "To" + "text": " To" }, { "id": 18295, diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json index fd77252d..3e9f3d73 100644 --- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json @@ -16,7 +16,7 @@ "id": 16017, "logprob": -1.3505859, "special": false, - "text": "blue" + "text": " blue" }, { "id": 20495, diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json index c9e552b6..c0834ae1 100644 --- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json @@ -1,58 +1,4 @@ [ - { - "details": { - "best_of_sequences": null, - "finish_reason": "eos_token", - "generated_tokens": 6, - "prefill": [ - { - "id": 0, - "logprob": null, - "text": "" - } - ], - "seed": null, - "tokens": [ - { - "id": 259, - "logprob": -1.3789062, - "special": false, - "text": "" - }, - { - "id": 39261, - "logprob": -0.36279297, - "special": false, - "text": "Because" - }, - { - "id": 609, - "logprob": -1.0966797, - "special": false, - "text": " it" - }, - { - "id": 339, - "logprob": -0.8276367, - "special": false, - "text": " is" - }, - { - "id": 16017, - "logprob": -1.6845703, - "special": false, - "text": " blue" - }, - { - "id": 1, - "logprob": -0.72753906, - "special": true, - "text": "" - } - ] - }, - "generated_text": "Because it is blue" - }, { "details": { "best_of_sequences": null, @@ -71,7 +17,7 @@ "id": 259, "logprob": -1.3798828, "special": false, - "text": "" + "text": " " }, { "id": 39261, @@ -125,7 +71,7 @@ "id": 259, "logprob": -1.3789062, "special": false, - "text": "" + "text": " " }, { "id": 39261, @@ -179,7 +125,61 @@ "id": 259, "logprob": -1.3789062, "special": false, - "text": "" + "text": " " + }, + { + "id": 39261, + "logprob": -0.36279297, + "special": false, + "text": "Because" + }, + { + "id": 609, + "logprob": -1.0966797, + "special": false, + "text": " it" + }, + { + "id": 339, + "logprob": -0.8276367, + "special": false, + "text": " is" + }, + { + "id": 16017, + "logprob": -1.6845703, + "special": false, + "text": " blue" + }, + { + "id": 1, + "logprob": -0.72753906, + "special": true, + "text": "" + } + ] + }, + "generated_text": "Because it is blue" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 6, + "prefill": [ + { + "id": 0, + "logprob": null, + "text": "" + } + ], + "seed": null, + "tokens": [ + { + "id": 259, + "logprob": -1.3789062, + "special": false, + 
"text": " " }, { "id": 39261, diff --git a/router/src/main.rs b/router/src/main.rs index 5ad49003..82bf6ba8 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -146,7 +146,7 @@ fn main() -> Result<(), std::io::Error> { sha: None, pipeline_tag: None, }, - false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or({ + false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or_else(|| { tracing::warn!("Could not retrieve model info from the Hugging Face hub."); HubModelInfo { model_id: tokenizer_name.to_string(), sha: None, pipeline_tag: None } }), diff --git a/server/Makefile b/server/Makefile index 150d7e4a..6eb56c75 100644 --- a/server/Makefile +++ b/server/Makefile @@ -2,7 +2,7 @@ include Makefile-transformers include Makefile-flash-att unit-tests: - python -m pytest tests + pytest -s -vv -m "not private" tests gen-server: # Compile protos diff --git a/server/tests/models/test_model.py b/server/tests/models/test_model.py new file mode 100644 index 00000000..32bcd45f --- /dev/null +++ b/server/tests/models/test_model.py @@ -0,0 +1,78 @@ +import pytest +import torch + +from transformers import AutoTokenizer + +from text_generation_server.models import Model + + +def get_test_model(): + class TestModel(Model): + def batch_type(self): + raise NotImplementedError + + def generate_token(self, batch): + raise NotImplementedError + + tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b") + + model = TestModel( + torch.nn.Linear(1, 1), tokenizer, False, torch.float32, torch.device("cpu") + ) + return model + + +@pytest.mark.private +def test_decode_streaming_english_spaces(): + model = get_test_model() + truth = "Hello here, this is a simple test" + all_input_ids = [15043, 1244, 29892, 445, 338, 263, 2560, 1243] + assert ( + all_input_ids == model.tokenizer(truth, add_special_tokens=False)["input_ids"] + ) + + decoded_text = "" + offset = 0 + token_offset = 0 + for i in range(len(all_input_ids)): + text, offset, token_offset = model.decode_token( + all_input_ids[: i + 1], offset, token_offset + ) + decoded_text += text + + assert decoded_text == truth + + +@pytest.mark.private +def test_decode_streaming_chinese_utf8(): + model = get_test_model() + truth = "我很感谢你的热情" + all_input_ids = [ + 30672, + 232, + 193, + 139, + 233, + 135, + 162, + 235, + 179, + 165, + 30919, + 30210, + 234, + 134, + 176, + 30993, + ] + + decoded_text = "" + offset = 0 + token_offset = 0 + for i in range(len(all_input_ids)): + text, offset, token_offset = model.decode_token( + all_input_ids[: i + 1], offset, token_offset + ) + decoded_text += text + + assert decoded_text == truth diff --git a/server/tests/models/test_seq2seq_lm.py b/server/tests/models/test_seq2seq_lm.py index 65dafa50..ba769e75 100644 --- a/server/tests/models/test_seq2seq_lm.py +++ b/server/tests/models/test_seq2seq_lm.py @@ -149,7 +149,7 @@ def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch) assert all([generation.generated_text is None for generation in generations]) assert all([len(generation.prefill_tokens) == 1 for generation in generations]) assert all([generation.token_id.item() == 259 for generation in generations]) - assert all([generation.token_text == "" for generation in generations]) + assert all([generation.token_text == " " for generation in generations]) assert generations[0].request_id == 0 diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 9029e954..1f324f77 100644 --- 
a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -56,7 +56,7 @@ class BLOOM(CausalLM): quantize: Optional[str] = None, ): super(BLOOM, self).__init__( - model_id=model_id, revision=revision, quantize=quantize, decode_buffer=1 + model_id=model_id, revision=revision, quantize=quantize ) @property @@ -104,14 +104,13 @@ class BLOOMSharded(BLOOM): rank=rank, world_size=world_size, ) - self.model = model.eval() torch.distributed.barrier(group=self.process_group) super(CausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, device=device, - decode_buffer=1, rank=rank, world_size=world_size, ) diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 0d521ac4..9d8ae254 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -35,8 +35,8 @@ class CausalLMBatch(Batch): # Lengths of all generations present in the batch input_lengths: List[int] - offsets: List[Optional[int]] - token_offsets: List[Optional[int]] + prefix_offsets: List[int] + read_offsets: List[int] # Generation helpers next_token_choosers: List[NextTokenChooser] @@ -70,8 +70,8 @@ class CausalLMBatch(Batch): inputs = [] next_token_choosers = [] stopping_criterias = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] requests_idx_mapping = {} # Parse batch @@ -81,8 +81,6 @@ class CausalLMBatch(Batch): for i, r in enumerate(pb.requests): requests_idx_mapping[r.id] = i inputs.append(r.inputs) - offsets.append(None) - token_offsets.append(None) next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device)) stopping_criteria = StoppingCriteria.from_pb( r.stopping_parameters, tokenizer @@ -102,6 +100,10 @@ class CausalLMBatch(Batch): truncation=True, max_length=max_truncation, ).to(device) + for _ in pb.requests: + input_len = tokenized_inputs["input_ids"].shape[1] + prefix_offsets.append(0) + read_offsets.append(input_len) input_lengths = tokenized_inputs["attention_mask"].sum(1) max_input_length = input_lengths.max() @@ -130,8 +132,8 @@ class CausalLMBatch(Batch): past_key_values=None, all_input_ids=list(all_input_ids), input_lengths=input_lengths.tolist(), - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, max_input_length=max_input_length.item(), @@ -151,8 +153,8 @@ class CausalLMBatch(Batch): # New values after filtering requests_idx_mapping = {} input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] all_input_ids = [] max_input_length = 0 @@ -167,8 +169,8 @@ class CausalLMBatch(Batch): requests_idx_mapping[r.id] = i keep_indices.append(idx) - offsets.append(self.offsets[idx]) - token_offsets.append(self.token_offsets[idx]) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) all_input_ids.append(self.all_input_ids[idx]) request_input_length = self.input_lengths[idx] @@ -225,8 +227,8 @@ class CausalLMBatch(Batch): self.position_ids = position_ids self.all_input_ids = all_input_ids self.input_lengths = input_lengths - self.offsets = offsets - self.token_offsets = token_offsets + self.prefix_offsets = prefix_offsets + self.read_offsets = read_offsets self.next_token_choosers = next_token_choosers self.stopping_criterias = stopping_criterias 
self.max_input_length = max_input_length @@ -251,8 +253,8 @@ class CausalLMBatch(Batch): requests = [] requests_idx_mapping = {} input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] all_input_ids = [] next_token_choosers = [] stopping_criterias = [] @@ -270,8 +272,8 @@ class CausalLMBatch(Batch): for i, batch in enumerate(batches): requests.extend(batch.requests) input_lengths.extend(batch.input_lengths) - offsets.extend(batch.offsets) - token_offsets.extend(batch.token_offsets) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) all_input_ids.extend(batch.all_input_ids) next_token_choosers.extend(batch.next_token_choosers) stopping_criterias.extend(batch.stopping_criterias) @@ -428,8 +430,8 @@ class CausalLMBatch(Batch): past_key_values=past_key_values, all_input_ids=all_input_ids, input_lengths=input_lengths, - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, max_input_length=max_input_length, @@ -448,7 +450,6 @@ class CausalLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - decode_buffer: int = 3, ): if torch.cuda.is_available(): device = torch.device("cuda") @@ -463,25 +464,25 @@ class CausalLM(Model): tokenizer = AutoTokenizer.from_pretrained( model_id, revision=revision, padding_side="left", truncation_side="left" ) - self.model = AutoModelForCausalLM.from_pretrained( + model = AutoModelForCausalLM.from_pretrained( model_id, revision=revision, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None, load_in_8bit=quantize == "bitsandbytes", - ).eval() + ) tokenizer.pad_token_id = ( - self.model.config.pad_token_id - if self.model.config.pad_token_id is not None - else self.model.config.eos_token_id + model.config.pad_token_id + if model.config.pad_token_id is not None + else model.config.eos_token_id ) super(CausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, device=device, - decode_buffer=decode_buffer, ) @property @@ -528,8 +529,8 @@ class CausalLM(Model): iterator = zip( batch.requests, batch.input_lengths, - batch.offsets, - batch.token_offsets, + batch.prefix_offsets, + batch.read_offsets, logits, batch.next_token_choosers, batch.stopping_criterias, @@ -540,8 +541,8 @@ class CausalLM(Model): for i, ( request, input_length, - offset, - token_offset, + prefix_offset, + read_offset, logits, next_token_chooser, stopping_criteria, @@ -559,8 +560,8 @@ class CausalLM(Model): # Generated token next_token_logprob = logprobs[-1, next_token_id] next_token_id_squeezed = next_token_id.squeeze() - next_token_text, offset, token_offset = self.decode_token( - all_input_ids[:, 0], offset, token_offset + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[:, 0], prefix_offset, read_offset ) # Evaluate stopping criteria @@ -628,8 +629,8 @@ class CausalLM(Model): batch.input_ids[i, 0] = next_token_id batch.all_input_ids[i] = all_input_ids batch.input_lengths[i] = new_input_length - batch.offsets[i] = offset - batch.token_offsets[i] = token_offset + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset batch.max_input_length = max(batch.max_input_length, new_input_length) # We finished all generations in the batch; there is no next batch diff --git a/server/text_generation_server/models/flash_causal_lm.py 
b/server/text_generation_server/models/flash_causal_lm.py index 0a9fccca..aee0480d 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -52,8 +52,8 @@ class FlashCausalLMBatch(Batch): # Lengths of all generations present in the batch input_lengths: List[int] - offsets: List[Optional[int]] - token_offsets: List[Optional[int]] + prefix_offsets: List[Optional[int]] + read_offsets: List[Optional[int]] # Generation helpers next_token_choosers: List[NextTokenChooser] @@ -82,8 +82,8 @@ class FlashCausalLMBatch(Batch): max_seqlen = 0 input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] all_input_ids = [] requests_idx_mapping = {} @@ -108,8 +108,8 @@ class FlashCausalLMBatch(Batch): max_seqlen = max(max_seqlen, input_length) input_lengths.append(input_length) - offsets.append(None) - token_offsets.append(None) + prefix_offsets.append(0) + read_offsets.append(input_length) all_input_ids.append(tokenized_input) @@ -151,8 +151,8 @@ class FlashCausalLMBatch(Batch): max_seqlen=max_seqlen, past_key_values=None, input_lengths=input_lengths, - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=[], next_token_choosers=next_token_choosers, @@ -190,8 +190,8 @@ class FlashCausalLMBatch(Batch): all_input_ids_tensor = [] input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] next_token_choosers = [] stopping_criterias = [] @@ -222,8 +222,8 @@ class FlashCausalLMBatch(Batch): all_input_ids_tensor.append(self.all_input_ids_tensor[idx]) input_lengths.append(request_input_length) - offsets.append(self.offsets[idx]) - token_offsets.append(self.token_offsets[idx]) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) next_token_choosers.append(self.next_token_choosers[idx]) @@ -269,8 +269,8 @@ class FlashCausalLMBatch(Batch): max_seqlen=max_seqlen, past_key_values=past_key_values, input_lengths=input_lengths, - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, next_token_choosers=next_token_choosers, @@ -302,8 +302,8 @@ class FlashCausalLMBatch(Batch): all_input_ids_tensor = [] input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] next_token_choosers = [] stopping_criterias = [] @@ -347,8 +347,8 @@ class FlashCausalLMBatch(Batch): all_input_ids_tensor.extend(batch.all_input_ids_tensor) input_lengths.extend(batch.input_lengths) - offsets.extend(batch.offsets) - token_offsets.extend(batch.token_offsets) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) next_token_choosers.extend(batch.next_token_choosers) stopping_criterias.extend(batch.stopping_criterias) @@ -374,8 +374,8 @@ class FlashCausalLMBatch(Batch): max_seqlen=max_seqlen, past_key_values=past_key_values, input_lengths=input_lengths, - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, next_token_choosers=next_token_choosers, @@ -394,7 +394,6 @@ class FlashCausalLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - decode_buffer: int = 3, ): if torch.cuda.is_available(): 
device = torch.device("cuda") @@ -405,23 +404,19 @@ class FlashCausalLM(Model): tokenizer = AutoTokenizer.from_pretrained( model_id, revision=revision, padding_side="left", truncation_side="left" ) - self.model = ( - model_cls.from_pretrained( - model_id, - revision=revision, - torch_dtype=dtype, - load_in_8bit=quantize == "bitsandbytes", - ) - .eval() - .to(device) - ) + model = model_cls.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + load_in_8bit=quantize == "bitsandbytes", + ).to(device) super(FlashCausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=False, dtype=dtype, device=device, - decode_buffer=decode_buffer, ) @property @@ -645,8 +640,8 @@ class FlashCausalLM(Model): iterator = zip( batch.requests, batch.input_lengths, - batch.offsets, - batch.token_offsets, + batch.prefix_offsets, + batch.read_offsets, batch.next_token_choosers, batch.stopping_criterias, batch.all_input_ids, @@ -659,8 +654,8 @@ class FlashCausalLM(Model): for i, ( request, input_length, - offset, - token_offset, + prefix_offset, + read_offset, next_token_chooser, stopping_criteria, all_input_ids, @@ -675,10 +670,10 @@ class FlashCausalLM(Model): all_input_ids.append(next_token_id) # Generated token - next_token_text, offset, token_offset = self.decode_token( + next_token_text, prefix_offset, read_offset = self.decode_token( all_input_ids, - offset, - token_offset, + prefix_offset, + read_offset, ) # Evaluate stopping criteria @@ -744,8 +739,8 @@ class FlashCausalLM(Model): # Update values batch.input_lengths[i] = new_input_length - batch.offsets[i] = offset - batch.token_offsets[i] = token_offset + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset batch.all_input_ids[i] = all_input_ids batch.max_seqlen = batch.max_seqlen + 1 cumulative_length += input_length diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index b775bd79..ebdbe206 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -64,9 +64,9 @@ class FlashLlama(FlashCausalLM): model = FlashLlamaForCausalLM(config) self.load_weights(model, filenames, quantize, device, dtype) - self.model = model.eval().to(device) super(FlashCausalLM, self).__init__( + model=model.to(device), tokenizer=tokenizer, requires_padding=False, dtype=dtype, @@ -189,9 +189,9 @@ class FlashLlamaSharded(FlashLlama): rank=rank, world_size=world_size, ) - self.model = model.eval().to(device) torch.distributed.barrier(group=self.process_group) super(FlashCausalLM, self).__init__( + model=model.to(device), tokenizer=tokenizer, requires_padding=False, dtype=dtype, diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index 0924f107..cac40bab 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -73,9 +73,9 @@ class FlashNeoXSharded(FlashNeoX): rank=rank, world_size=world_size, ) - self.model = model.eval().to(device) torch.distributed.barrier(group=self.process_group) super(FlashCausalLM, self).__init__( + model=model.to(device), tokenizer=tokenizer, requires_padding=False, dtype=dtype, diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 031a67eb..5dc31309 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ 
b/server/text_generation_server/models/flash_santacoder.py @@ -67,14 +67,13 @@ class FlashSantacoder(FlashCausalLM): dtype, config.architectures[0].startswith("GPT2"), ) - self.model = model.eval().to(device) super(FlashCausalLM, self).__init__( + model=model.to(device), tokenizer=tokenizer, requires_padding=False, dtype=dtype, device=device, - decode_buffer=1, ) @staticmethod @@ -213,16 +212,15 @@ class FlashSantacoderSharded(FlashSantacoder): world_size=world_size, transpose=config.architectures[0].startswith("GPT2"), ) - self.model = model.eval().to(device) torch.distributed.barrier(group=self.process_group) super(FlashCausalLM, self).__init__( + model=model.to(device), tokenizer=tokenizer, requires_padding=False, dtype=dtype, device=device, rank=rank, world_size=world_size, - decode_buffer=1, ) @staticmethod diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index d1e5e841..24c37c19 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -94,8 +94,8 @@ class GalacticaCausalLMBatch(CausalLMBatch): inputs = [] next_token_choosers = [] stopping_criterias = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] requests_idx_mapping = {} # Parse batch @@ -106,8 +106,6 @@ class GalacticaCausalLMBatch(CausalLMBatch): requests_idx_mapping[r.id] = i # Add escape_custom_split_sequence to the CausalLMBatch logic inputs.append(escape_custom_split_sequence(r.inputs)) - offsets.append(None) - token_offsets.append(None) next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device)) stopping_criteria = StoppingCriteria.from_pb( r.stopping_parameters, tokenizer @@ -127,6 +125,10 @@ class GalacticaCausalLMBatch(CausalLMBatch): truncation=True, max_length=max_truncation, ).to(device) + for _ in pb.requests: + input_len = tokenized_inputs["input_ids"].shape[1] + prefix_offsets.append(0) + read_offsets.append(input_len) input_lengths = tokenized_inputs["attention_mask"].sum(1) max_input_length = input_lengths.max() @@ -155,8 +157,8 @@ class GalacticaCausalLMBatch(CausalLMBatch): past_key_values=None, all_input_ids=list(all_input_ids), input_lengths=input_lengths.tolist(), - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, max_input_length=max_input_length.item(), @@ -231,9 +233,9 @@ class GalacticaSharded(Galactica): rank=rank, world_size=world_size, ) - self.model = model.eval() torch.distributed.barrier(group=self.process_group) super(CausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index f95e5be2..a10dfcb8 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -70,9 +70,9 @@ class GPTNeoxSharded(CausalLM): rank=rank, world_size=world_size, ) - self.model = model.eval() torch.distributed.barrier(group=self.process_group) super(CausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index 03f14013..29bad321 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -13,23 +13,20 
@@ B = TypeVar("B", bound=Batch) class Model(ABC): def __init__( self, + model: torch.nn.Module, tokenizer: PreTrainedTokenizerBase, requires_padding: bool, dtype: torch.dtype, device: torch.device, - decode_buffer: int = 3, rank: int = 0, world_size: int = 1, ): - if decode_buffer < 1: - raise ValueError("decode_buffer must be >= 1") - + self.model = model.eval() self.tokenizer = tokenizer self.all_special_ids = set(tokenizer.all_special_ids) self.requires_padding = requires_padding self.dtype = dtype self.device = device - self.decode_buffer = decode_buffer self.rank = rank self.world_size = world_size self.check_initialized() @@ -54,52 +51,29 @@ class Model(ABC): def decode_token( self, all_input_ids: List[int], - offset: Optional[int] = None, - token_offset: Optional[int] = None, - ) -> Tuple[str, Optional[int], Optional[int]]: + prefix_offset: int = 0, + read_offset: int = 0, + ) -> Tuple[str, int, int]: """Hack to hopefully support generate_stream for the maximum number of tokenizers""" - if all_input_ids[-1] in self.all_special_ids: - return ( - self.tokenizer.decode(all_input_ids[-1], skip_special_tokens=False), - None, - None, - ) - if token_offset is None: - token_offset = len(all_input_ids) - self.decode_buffer - # left token buffer - if self.decode_buffer > 1: - # Decode token_offset token minus last one and token_offset tokens - raw_texts = self.tokenizer.batch_decode( - [all_input_ids[token_offset:-1], all_input_ids[token_offset:]], - skip_special_tokens=False, - ) + # The prefix text is necessary only to defeat cleanup algorithms in the decode + # which decide to add a space or not depending on the surrounding ids. + prefix_text = self.tokenizer.decode( + all_input_ids[prefix_offset:read_offset], skip_special_tokens=False + ) + new_text = self.tokenizer.decode( + all_input_ids[prefix_offset:], skip_special_tokens=False + ) - # default offset is only the last token - offset = len(raw_texts[0]) - sequence_text = raw_texts[1] - else: - # Only decode the last token without using a token buffer - sequence_text = self.tokenizer.decode( - all_input_ids[-1], skip_special_tokens=False - ) - # no offset in this case - offset = 0 + if len(new_text) > len(prefix_text) and not new_text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. 
+ # If it's in the middle, it's probably a real invalid id generated + # by the model + new_text = new_text[len(prefix_text) :] + return new_text, read_offset, len(all_input_ids) else: - assert offset is not None - sequence_text = self.tokenizer.decode( - all_input_ids[token_offset:], - skip_special_tokens=False, - ) - - # get text - token_text = sequence_text[offset:] - - # if text is utf-8 - if token_text and token_text[-1] != "�": - return token_text, None, None - else: - return "", offset, token_offset + return "", prefix_offset, read_offset def check_initialized(self): uninitialized_parameters = [] diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 093cf70a..fdae795b 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -86,9 +86,9 @@ class OPTSharded(OPT): rank=rank, world_size=world_size, ) - self.model = model.eval() torch.distributed.barrier(group=self.process_group) super(CausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index 4bd56de1..23f89f48 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -46,24 +46,20 @@ class SantaCoder(CausalLM): } ) - self.model = ( - AutoModelForCausalLM.from_pretrained( - model_id, - revision=revision, - torch_dtype=dtype, - load_in_8bit=quantize == "bitsandbytes", - trust_remote_code=True, # required - ) - .to(device) - .eval() - ) + model = AutoModelForCausalLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + load_in_8bit=quantize == "bitsandbytes", + trust_remote_code=True, # required + ).to(device) super(CausalLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, device=device, - decode_buffer=1, ) def decode(self, generated_ids: List[int]) -> str: diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index 84854f5d..4f55b22f 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -42,8 +42,8 @@ class Seq2SeqLMBatch(Batch): # Lengths of all generations present in the batch input_lengths: List[int] decoder_input_lengths: List[int] - offsets: List[Optional[int]] - token_offsets: List[Optional[int]] + prefix_offsets: List[int] + read_offsets: List[int] # Generation helpers next_token_choosers: List[NextTokenChooser] @@ -79,8 +79,8 @@ class Seq2SeqLMBatch(Batch): stopping_criterias = [] decoder_input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] requests_idx_mapping = {} # Parse batch @@ -91,8 +91,6 @@ class Seq2SeqLMBatch(Batch): inputs.append(r.inputs) requests_idx_mapping[r.id] = i decoder_input_lengths.append(1) - offsets.append(None) - token_offsets.append(None) next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device)) stopping_criteria = StoppingCriteria.from_pb( r.stopping_parameters, tokenizer @@ -123,6 +121,9 @@ class Seq2SeqLMBatch(Batch): .repeat(len(pb.requests)) .view(-1, 1) ) + for _ in pb.requests: + prefix_offsets.append(0) + read_offsets.append(1) all_decoder_input_ids = decoder_input_ids.view(-1).split(1) max_tokens = len(inputs) * max_input_length + max_decode_tokens @@ -140,8 +141,8 @@ class Seq2SeqLMBatch(Batch): past_key_values=None, 
input_lengths=input_lengths.tolist(), decoder_input_lengths=decoder_input_lengths, - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, max_input_length=max_input_length.item(), @@ -165,8 +166,8 @@ class Seq2SeqLMBatch(Batch): requests_idx_mapping = {} input_lengths = [] decoder_input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] all_decoder_input_ids = [] @@ -184,8 +185,8 @@ class Seq2SeqLMBatch(Batch): requests_idx_mapping[r.id] = i keep_indices.append(idx) - offsets.append(self.offsets[idx]) - token_offsets.append(self.token_offsets[idx]) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) all_decoder_input_ids.append(self.all_decoder_input_ids[idx]) @@ -248,8 +249,8 @@ class Seq2SeqLMBatch(Batch): self.all_decoder_input_ids = all_decoder_input_ids self.input_lengths = input_lengths self.decoder_input_lengths = decoder_input_lengths - self.offsets = offsets - self.token_offsets = token_offsets + self.prefix_offsets = prefix_offsets + self.read_offsets = read_offsets self.next_token_choosers = next_token_choosers self.stopping_criterias = stopping_criterias self.max_input_length = max_input_length @@ -283,8 +284,8 @@ class Seq2SeqLMBatch(Batch): all_decoder_input_ids = [] input_lengths = [] decoder_input_lengths = [] - offsets = [] - token_offsets = [] + prefix_offsets = [] + read_offsets = [] next_token_choosers = [] stopping_criterias = [] max_tokens = 0 @@ -306,8 +307,8 @@ class Seq2SeqLMBatch(Batch): all_decoder_input_ids.extend(batch.all_decoder_input_ids) input_lengths.extend(batch.input_lengths) decoder_input_lengths.extend(batch.decoder_input_lengths) - offsets.extend(batch.offsets) - token_offsets.extend(batch.token_offsets) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) next_token_choosers.extend(batch.next_token_choosers) stopping_criterias.extend(batch.stopping_criterias) @@ -482,8 +483,8 @@ class Seq2SeqLMBatch(Batch): past_key_values=past_key_values, input_lengths=input_lengths, decoder_input_lengths=decoder_input_lengths, - offsets=offsets, - token_offsets=token_offsets, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, max_input_length=max_input_length, @@ -502,7 +503,6 @@ class Seq2SeqLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - decode_buffer: int = 3, ): if torch.cuda.is_available(): device = torch.device("cuda") @@ -514,24 +514,24 @@ class Seq2SeqLM(Model): device = torch.device("cpu") dtype = torch.float32 - self.model = AutoModelForSeq2SeqLM.from_pretrained( + model = AutoModelForSeq2SeqLM.from_pretrained( model_id, revision=revision, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None, load_in_8bit=quantize == "bitsandbytes", - ).eval() + ) tokenizer = AutoTokenizer.from_pretrained( model_id, revision=revision, padding_side="left", truncation_side="left" ) - tokenizer.bos_token_id = self.model.config.decoder_start_token_id + tokenizer.bos_token_id = model.config.decoder_start_token_id super(Seq2SeqLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype, device=device, - decode_buffer=decode_buffer, ) @property @@ -608,8 +608,8 @@ class Seq2SeqLM(Model): iterator = zip( batch.requests, batch.input_lengths, - 
batch.offsets, - batch.token_offsets, + batch.prefix_offsets, + batch.read_offsets, batch.decoder_input_lengths, logits, batch.next_token_choosers, @@ -621,8 +621,8 @@ class Seq2SeqLM(Model): for i, ( request, input_length, - offset, - token_offset, + prefix_offset, + read_offset, decoder_input_length, logits, next_token_chooser, @@ -643,8 +643,8 @@ class Seq2SeqLM(Model): # Generated token next_token_logprob = logprobs[-1, next_token_id] next_token_id_squeezed = next_token_id.squeeze() - next_token_text, offset, token_offset = self.decode_token( - all_decoder_input_ids, offset, token_offset + next_token_text, prefix_offset, read_offset = self.decode_token( + all_decoder_input_ids, prefix_offset, read_offset ) # Evaluate stopping criteria @@ -702,8 +702,8 @@ class Seq2SeqLM(Model): batch.all_decoder_input_ids[i] = all_decoder_input_ids batch.input_lengths[i] = input_length batch.decoder_input_lengths[i] = new_decoder_input_length - batch.offsets[i] = offset - batch.token_offsets[i] = token_offset + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset batch.max_input_length = max(batch.max_input_length, input_length) batch.max_decoder_input_length = max( batch.max_decoder_input_length, new_decoder_input_length diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 8e3826a4..b1ba2432 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -16,9 +16,6 @@ from text_generation_server.utils import ( initialize_torch_distributed, weight_files, ) -from text_generation_server.utils.layers import ( - FastLinear, -) from transformers.models.t5.parallel_layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, @@ -73,9 +70,9 @@ class T5Sharded(Seq2SeqLM): rank=rank, world_size=world_size, ) - self.model = model.eval() torch.distributed.barrier(group=self.process_group) super(Seq2SeqLM, self).__init__( + model=model, tokenizer=tokenizer, requires_padding=True, dtype=dtype,
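
A note on the decode fix in this last patch: the old `decode_buffer` heuristic is replaced by a prefix-window incremental decode driven by `prefix_offset` and `read_offset`. The sketch below is a minimal, self-contained illustration of that technique, not the server code itself; it assumes `transformers` is installed and uses `gpt2` purely as an example tokenizer.

```python
from transformers import AutoTokenizer


def decode_token(tokenizer, all_input_ids, prefix_offset=0, read_offset=0):
    # Decode a short prefix so the tokenizer's cleanup heuristics (e.g. whether
    # to re-insert a leading space) see the same left context as the full decode.
    prefix_text = tokenizer.decode(
        all_input_ids[prefix_offset:read_offset], skip_special_tokens=False
    )
    new_text = tokenizer.decode(
        all_input_ids[prefix_offset:], skip_special_tokens=False
    )
    if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
        # Emit only the newly completed characters; a trailing "�" means an
        # unfinished multi-byte sequence, so wait for more tokens instead.
        return new_text[len(prefix_text):], read_offset, len(all_input_ids)
    return "", prefix_offset, read_offset


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer only
    ids = tokenizer("Hello here, this is a simple test")["input_ids"]

    decoded, prefix_offset, read_offset = "", 0, 0
    for i in range(len(ids)):
        piece, prefix_offset, read_offset = decode_token(
            tokenizer, ids[: i + 1], prefix_offset, read_offset
        )
        decoded += piece

    # Streaming the deltas should reconstruct the original prompt.
    assert decoded == "Hello here, this is a simple test"
```

This mirrors what `Model.decode_token` and the new `server/tests/models/test_model.py` exercise: tokens are emitted one at a time, and the concatenation of the streamed pieces matches a full decode even for tokenizers that add or strip spaces depending on the surrounding ids.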