Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-04-26 12:32:10 +00:00

Merge branch 'main' into feature/get-trace-id-from-req-headers
Commit b6e3ffb037

1 .github/workflows/build.yaml (vendored)
@@ -202,4 +202,5 @@ jobs:
export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
export HF_TOKEN=${{ secrets.HF_TOKEN }}
echo $DOCKER_IMAGE
docker pull $DOCKER_IMAGE
pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
41 .github/workflows/load_test.yaml (vendored)
@@ -3,12 +3,17 @@ name: Nightly load test
on:
schedule:
- cron: '0 0 * * 1-5'
workflow_call:
workflow_dispatch:

pull_request:
paths:
- ".github/workflows/load_test.yaml"
branches:
- 'main'

env:
AWS_DEFAULT_REGION: us-east-1
AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}

jobs:
load-tests:

@@ -16,28 +21,30 @@ jobs:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-g5-12xlarge
group: aws-g6-12xl-plus-priv-cache
env:
DOCKER_VOLUME: /cache
steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Install k6
run: |
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
- name: Install Python 3.11
uses: actions/setup-python@v2
with:
python-version: 3.11

- name: Start starcoder
- name: Install poetry
run: |
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
curl -sSL https://install.python-poetry.org | python3 -
export PATH="$HOME/.local/bin:$PATH"
poetry --version

- name: Run k6
- name: Run bench test
run: |
./k6 run load_tests/starcoder_load.js

- name: Stop starcoder
if: ${{ always() }}
run: |
docker stop tgi-starcoder || true
export PATH="$HOME/.local/bin:$PATH"
cd load_tests
poetry install
poetry run python benchmarks.py --sha ${{ github.sha }} --results-file "s3://text-generation-inference-ci/benchmarks/ci/${{ github.sha }}.parquet"
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
34 .github/workflows/nix_cache.yaml (vendored, new file)
@@ -0,0 +1,34 @@
name: "Cache devshells"
on:
pull_request:
paths:
- "flake.nix"
- "flake.lock"
- "nix/**"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

jobs:
tests:
runs-on:
group: aws-highmemory-32-plus-priv
steps:
- uses: actions/checkout@v4
- uses: cachix/install-nix-action@v27
with:
nix_path: nixpkgs=channel:nixos-unstable
- uses: cachix/cachix-action@v14
with:
name: text-generation-inference
# If you chose signing key for write access
authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
env:
USER: github_runner
- name: Build impure devshell
run: nix build .\#devShells.x86_64-linux.impure
- name: Build impure devshell (CUDA dev)
run: nix build .\#devShells.x86_64-linux.impureWithCuda
# Pure shell dependencies are covered by Nix tests.
# - name: Build pure devshell
# run: nix build .\#devShells.x86_64-linux.pure
2 .gitignore (vendored)
@@ -5,6 +5,8 @@ router/tokenizer.json

backends/v2/src/client/pb
backends/v3/src/client/pb
backends/client/src/v2/pb
backends/client/src/v3/pb

# ROCm auto-generated files
*.hip
563 Cargo.lock (generated)
File diff suppressed because it is too large.
@@ -20,7 +20,7 @@ default-members = [
resolver = "2"

[workspace.package]
version = "2.3.2-dev0"
version = "2.4.1-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
27 Dockerfile
@@ -161,27 +161,6 @@ COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src

@@ -239,10 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from fbgemm builder
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages

@@ -258,7 +233,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
@@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_rocm.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -83,7 +83,11 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils
RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" > /tmp/intel-for-pytorch-gpu-dev.list

RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit=2024.2.1-98 xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9

# Text Generation Inference base env
ENV HF_HOME=/data \

@@ -91,8 +95,15 @@ ENV HF_HOME=/data \
PORT=80

WORKDIR /usr/src
RUN pip install torch==2.3.1+cxx11.abi torchvision==0.18.1+cxx11.abi torchaudio==2.3.1+cxx11.abi intel-extension-for-pytorch==2.3.110+xpu oneccl_bind_pt==2.3.100+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir

RUN pip install triton-xpu==3.0.0b2 --no-cache-dir

# Install server
COPY proto proto

@@ -101,19 +112,19 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:/opt/conda/lib
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV PATH=/opt/conda/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV CCL_ZE_IPC_EXCHANGE=sockets
ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
ENV TORCH_LLM_ALLREDUCE=1
ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
#ENV TORCH_LLM_ALLREDUCE=1
#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

@@ -186,7 +197,7 @@ RUN pip install triton py-libnuma

WORKDIR /usr/src

RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout f86e93e4890dc2c989024d148d415c9aa8a1649f
RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 2e1c98f74ec1b35ad8dd1ebe7dd4b25470f2fd41
RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout v2.4.0+cpu+rc0

RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install

@@ -207,7 +218,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
45 README.md
@@ -28,6 +28,7 @@ to power Hugging Chat, the Inference API and Inference Endpoint.
- [Distributed Tracing](#distributed-tracing)
- [Architecture](#architecture)
- [Local install](#local-install)
- [Local install (Nix)](#local-install-nix)
- [Optimized architectures](#optimized-architectures)
- [Run locally](#run-locally)
- [Run](#run)

@@ -83,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model
ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
```

And then you can make requests like

@@ -120,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0-rocm --model-id $model` instead of the command above.

To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```

@@ -150,7 +151,7 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>

docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
```

### A note on Shared Memory (shm)

@@ -236,6 +237,44 @@ text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
sudo apt-get install libssl-dev gcc -y
```

### Local install (Nix)

Another option is to install `text-generation-inference` locally using [Nix](https://nixos.org). Currently,
we only support Nix on x86_64 Linux with CUDA GPUs. When using Nix, all dependencies can
be pulled from a binary cache, removing the need to build them locally.

First follow the instructions to [install Cachix and enable the TGI cache](https://app.cachix.org/cache/text-generation-inference).
Setting up the cache is important, otherwise Nix will build many of the dependencies
locally, which can take hours.

After that you can run TGI with `nix run`:

```shell
nix run . -- --model-id meta-llama/Llama-3.1-8B-Instruct
```

**Note:** when you are using Nix on a non-NixOS system, you have to [make some symlinks](https://danieldk.eu/Nix-CUDA-on-non-NixOS-systems#make-runopengl-driverlib-and-symlink-the-driver-library)
to make the CUDA driver libraries visible to Nix packages.

For TGI development, you can use the `impure` dev shell:

```shell
nix develop .#impure

# Only needed the first time the devshell is started or after updating the protobuf.
(
cd server
mkdir text_generation_server/pb || true
python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
touch text_generation_server/pb/__init__.py
)
```

All development dependencies (cargo, Python, Torch), etc. are available in this
dev shell.

## Optimized architectures

TGI works out of the box to serve optimized models for all modern models. They can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).
@@ -107,20 +107,22 @@ impl Client {
#[instrument(skip_all)]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_input_tokens: Option<u32>,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_total_tokens: Option<u32>,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
) -> Result<(Option<u32>, u32, u32)> {
let mut n_tokens = 0;
let mut requests = Vec::new();
// Create requests
while n_tokens < max_prefill_tokens {
let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
let mut truncate = max_prefill_tokens - n_tokens;
if let Some(max_input_tokens) = max_input_tokens {
truncate = min(max_input_tokens, truncate);
}

let mut input_chunks = Vec::new();
input_chunks
.push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
input_chunks.push(Chunk::Text("_test ".to_string().repeat(truncate as usize)).into());
if n_tokens == 0 {
input_chunks.push(
Chunk::Image(Image {

@@ -136,7 +138,7 @@ impl Client {
// been updated to support chunks.

let mut inputs = String::new();
inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
inputs.push_str(&"_test ".to_string().repeat(truncate as usize));
if n_tokens == 0 {
// 1 request is enough to test vision heads.
// Sending images on other queries messes up easily with truncation.

@@ -145,6 +147,12 @@ impl Client {
));
}

let max_new_tokens = if let Some(max_total_tokens) = max_total_tokens {
max_total_tokens - truncate
} else {
1
};

requests.push(Request {
id: 0,
inputs,

@@ -175,7 +183,7 @@ impl Client {
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: max_total_tokens - truncate,
max_new_tokens,
stop_sequences: vec![],
ignore_eos_token: true,
}),

@@ -183,7 +191,7 @@ impl Client {
top_n_tokens: 20,
adapter_id: None,
});
n_tokens += max_input_length;
n_tokens += truncate;

// Check max_batch_size
if Some(requests.len()) == max_batch_size {

@@ -195,19 +203,23 @@ impl Client {
id: 0,
size: requests.len() as u32,
requests,
max_tokens: max_input_length,
max_tokens: max_input_tokens.unwrap_or(0),
max_blocks: 0,
};

let request = tonic::Request::new(WarmupRequest {
batch: Some(batch),
max_input_length,
max_input_tokens,
max_prefill_tokens,
max_total_tokens,
})
.inject_context();
let response = self.stub.warmup(request).await?.into_inner();
Ok(response.max_supported_total_tokens)
Ok((
response.max_supported_total_tokens,
response.max_input_tokens,
response.max_total_tokens,
))
}

/// Generate one token for each request in the given batch
@@ -101,11 +101,11 @@ impl ShardedClient {
#[instrument(skip(self))]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_input_length: Option<u32>,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_total_tokens: Option<u32>,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
) -> Result<(Option<u32>, u32, u32)> {
let futures: Vec<_> = self
.clients
.iter_mut()

@@ -122,8 +122,16 @@ impl ShardedClient {
let results = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<Option<u32>>>>()?;
Ok(results.into_iter().flatten().min())
.collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;

// Take the minimum value
// Different shards hold different parts of vocab, might yield
// different available block size.
let min = results
.iter()
.min()
.expect("Expect at least 1 warmup result");
Ok(*min)
}

/// Generate one token for each request in the given batch
@@ -62,6 +62,8 @@ struct Args {
executor_worker: PathBuf,
#[clap(default_value = "on", long, env)]
usage_stats: usage_stats::UsageStatsLevel,
#[clap(default_value = "2000000", long, env)]
payload_limit: usize,
}

async fn get_tokenizer(

@@ -217,6 +219,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
auth_token,
executor_worker,
usage_stats,
payload_limit,
} = args;

// Launch Tokio runtime

@@ -287,6 +290,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
tokenizer_name,
tokenizer_config_path,
revision,
false,
hostname,
port,
cors_allow_origin,

@@ -296,6 +300,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
true,
max_client_batch_size,
usage_stats,
payload_limit,
)
.await?;
Ok(())
@@ -70,6 +70,8 @@ struct Args {
max_client_batch_size: usize,
#[clap(default_value = "on", long, env)]
usage_stats: usage_stats::UsageStatsLevel,
#[clap(default_value = "2000000", long, env)]
payload_limit: usize,
}

#[derive(Debug, Subcommand)]

@@ -114,6 +116,7 @@ async fn main() -> Result<(), RouterError> {
disable_grammar_support,
max_client_batch_size,
usage_stats,
payload_limit,
} = args;

if let Some(Commands::PrintSchema) = command {

@@ -194,6 +197,7 @@ async fn main() -> Result<(), RouterError> {
disable_grammar_support,
max_client_batch_size,
usage_stats,
payload_limit,
)
.await?;
Ok(())
@@ -193,7 +193,7 @@ pub(crate) async fn batching_task(
};

// Try to get a new batch
if let Some((new_entries, new_batch, span)) = queue
if let Some((mut new_entries, new_batch, span)) = queue
.next_batch(min_size, max_size, prefill_token_budget, token_budget)
.await
{

@@ -209,11 +209,26 @@ pub(crate) async fn batching_task(
};
counter.increment(1);
}
let cached_batch = if support_chunking {
// Concat current batch to the new one
batches.pop()

let new_cached_batch = if support_chunking {
// Get cached batch
let cached_batch = batches.pop();
// Extend entries with the new entries since the batch will be
// concatenated during the prefill op server side
entries.extend(new_entries);
// Generate one token for both the cached batch and the new batch
let new_cached_batch =
prefill(&mut client, new_batch, cached_batch, &mut entries)
.instrument(span)
.await;
if new_cached_batch.is_none() {
// New cached batch is empty, no work left
break;
}
new_cached_batch
} else {
// Request are waiting only if we don't support chunking
// Request are waiting because we cannot concatenate the batches if the
// model/server does not support chunking
entries.iter_mut().for_each(|(_, entry)| {
// Create a new span to add the info that this entry is waiting
// because a new batch is being computed

@@ -224,23 +239,24 @@ pub(crate) async fn batching_task(
// Update entry
entry.temp_span = Some(entry_waiting_span);
});
None
};
entries.extend(new_entries);

// Generate one token for this new batch to have the attention past in cache
let new_cached_batch =
prefill(&mut client, new_batch, cached_batch, &mut entries)
prefill(&mut client, new_batch, None, &mut new_entries)
.instrument(span)
.await;
if new_cached_batch.is_some() {
// Extend entries
entries.extend(new_entries);
}
new_cached_batch
};

// Reset waiting counter
waiting_tokens = 1;
// Extend current batch with the new batch
if let Some(new_cached_batch) = new_cached_batch {
batches.push(new_cached_batch);
} else if support_chunking {
// New cached batch is empty, no work left
break;
}
}
@@ -108,20 +108,22 @@ impl Client {
#[instrument(skip_all)]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_input_tokens: Option<u32>,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_total_tokens: Option<u32>,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
) -> Result<(Option<u32>, u32, u32)> {
let mut n_tokens = 0;
let mut requests = Vec::new();
// Create requests
while n_tokens < max_prefill_tokens {
let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
let mut truncate = max_prefill_tokens - n_tokens;
if let Some(max_input_tokens) = max_input_tokens {
truncate = min(max_input_tokens, truncate);
}

let mut input_chunks = Vec::new();
input_chunks
.push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
input_chunks.push(Chunk::Text("_test ".to_string().repeat(truncate as usize)).into());
if n_tokens == 0 {
input_chunks.push(
Chunk::Image(Image {

@@ -137,7 +139,7 @@ impl Client {
// been updated to support chunks.

let mut inputs = String::new();
inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
inputs.push_str(&"_test ".to_string().repeat(truncate as usize));
if n_tokens == 0 {
// 1 request is enough to test vision heads.
// Sending images on other queries messes up easily with truncation.

@@ -146,6 +148,12 @@ impl Client {
));
}

let max_new_tokens = if let Some(max_total_tokens) = max_total_tokens {
max_total_tokens - truncate
} else {
1
};

requests.push(Request {
id: 0,
inputs,

@@ -175,7 +183,7 @@ impl Client {
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: max_total_tokens - truncate,
max_new_tokens,
stop_sequences: vec![],
ignore_eos_token: true,
}),

@@ -183,7 +191,7 @@ impl Client {
top_n_tokens: 20,
adapter_id: None,
});
n_tokens += max_input_length;
n_tokens += truncate;

// Check max_batch_size
if Some(requests.len()) == max_batch_size {

@@ -195,19 +203,23 @@ impl Client {
id: 0,
size: requests.len() as u32,
requests,
max_tokens: max_input_length,
max_tokens: max_input_tokens.unwrap_or(0),
max_blocks: 0,
};

let request = tonic::Request::new(WarmupRequest {
batch: Some(batch),
max_input_length,
max_input_tokens,
max_prefill_tokens,
max_total_tokens,
})
.inject_context();
let response = self.stub.warmup(request).await?.into_inner();
Ok(response.max_supported_total_tokens)
Ok((
response.max_supported_total_tokens,
response.max_input_tokens,
response.max_total_tokens,
))
}

/// Generate one token for each request in the given batch
@@ -102,11 +102,11 @@ impl ShardedClient {
#[instrument(skip(self))]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_input_length: Option<u32>,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_total_tokens: Option<u32>,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
) -> Result<(Option<u32>, u32, u32)> {
let futures: Vec<_> = self
.clients
.iter_mut()

@@ -119,12 +119,19 @@ impl ShardedClient {
))
})
.collect();
// Take the minimum value
let results = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<Option<u32>>>>()?;
Ok(results.into_iter().flatten().min())
.collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;

// Take the minimum value
// Different shards hold different parts of vocab, might yield
// different available block size.
let min = results
.iter()
.min()
.expect("Expect at least 1 warmup result");
Ok(*min)
}

/// Generate one token for each request in the given batch
@@ -37,12 +37,17 @@ pub struct BackendInfo {
pub attention_impl: String,
#[schema(example = "1")]
pub block_size: u32,

#[schema(example = "30000")]
pub max_input_tokens: usize,
#[schema(example = "32000")]
pub max_total_tokens: usize,
}

#[allow(clippy::too_many_arguments)]
pub async fn connect_backend(
max_input_tokens: usize,
max_total_tokens: usize,
max_input_tokens: Option<usize>,
max_total_tokens: Option<usize>,
master_shard_uds_path: String,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,

@@ -51,14 +56,32 @@ pub async fn connect_backend(
max_batch_size: Option<usize>,
) -> Result<(BackendV3, BackendInfo), V3Error> {
// Helper function
let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
let check_max_batch_total_tokens = |(
max_supported_batch_total_tokens,
shard_max_input_tokens,
shard_max_total_tokens,
): (Option<u32>, u32, u32)|
 -> Result<(u32, usize, usize), V3Error> {
if let Some(max_input_tokens) = max_input_tokens {
assert_eq!(max_input_tokens as u32, shard_max_input_tokens);
}
if let Some(max_total_tokens) = max_total_tokens {
assert_eq!(max_total_tokens as u32, shard_max_total_tokens);
}
match max_supported_batch_total_tokens {
// Older models do not support automatic max-batch-total-tokens
None => {
let max_batch_total_tokens = max_batch_total_tokens
.unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
let max_batch_total_tokens = max_batch_total_tokens.unwrap_or(
16000
.max(shard_max_total_tokens)
.max(max_batch_prefill_tokens),
);
tracing::warn!("Model does not support automatic max batch total tokens");
Ok(max_batch_total_tokens)
Ok((
max_batch_total_tokens,
shard_max_input_tokens as usize,
shard_max_total_tokens as usize,
))
}
// Flash attention models return their max supported total tokens
Some(max_supported_batch_total_tokens) => {

@@ -72,11 +95,15 @@ pub async fn connect_backend(
"Inferred max batch total tokens: {max_supported_batch_total_tokens}"
);
}
if max_total_tokens as u32 > max_supported_batch_total_tokens {
return Err(V3Error::NotEnoughMemory(max_total_tokens));
if shard_max_total_tokens > max_supported_batch_total_tokens {
return Err(V3Error::NotEnoughMemory(shard_max_total_tokens as usize));
}

Ok(max_supported_batch_total_tokens)
Ok((
max_supported_batch_total_tokens,
shard_max_input_tokens as usize,
shard_max_total_tokens as usize,
))
}
}
};

@@ -96,23 +123,25 @@ pub async fn connect_backend(

// Warmup model
tracing::info!("Warming up model");
let max_batch_total_tokens = check_max_batch_total_tokens(
sharded_client
let answer = sharded_client
.warmup(
max_input_tokens as u32,
max_input_tokens.map(|p| p as u32),
max_batch_prefill_tokens,
max_total_tokens as u32,
max_total_tokens.map(|p| p as u32),
max_batch_size,
)
.await
.map_err(V3Error::Warmup)?,
)?;
.map_err(V3Error::Warmup)?;
let (max_batch_total_tokens, max_input_tokens, max_total_tokens) =
check_max_batch_total_tokens(answer)?;
tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
metrics::gauge!("tgi_batch_max_total_tokens").set(max_batch_total_tokens);

let backend_info = BackendInfo {
waiting_served_ratio,
max_batch_total_tokens,
max_input_tokens,
max_total_tokens,
max_waiting_tokens,
max_batch_size,
model_device_type: shard_info.device_type.clone(),
@@ -18,10 +18,10 @@ struct Args {
max_stop_sequences: usize,
#[clap(default_value = "5", long, env)]
max_top_n_tokens: u32,
#[clap(default_value = "1024", long, env)]
max_input_tokens: usize,
#[clap(default_value = "2048", long, env)]
max_total_tokens: usize,
#[clap(long, env)]
max_input_tokens: Option<usize>,
#[clap(long, env)]
max_total_tokens: Option<usize>,
#[clap(default_value = "1.2", long, env)]
waiting_served_ratio: f32,
#[clap(default_value = "4096", long, env)]

@@ -70,6 +70,8 @@ struct Args {
max_client_batch_size: usize,
#[clap(default_value = "on", long, env)]
usage_stats: usage_stats::UsageStatsLevel,
#[clap(default_value = "2000000", long, env)]
payload_limit: usize,
}

#[derive(Debug, Subcommand)]

@@ -114,6 +116,7 @@ async fn main() -> Result<(), RouterError> {
disable_grammar_support,
max_client_batch_size,
usage_stats,
payload_limit,
} = args;

if let Some(Commands::PrintSchema) = command {

@@ -126,12 +129,6 @@ async fn main() -> Result<(), RouterError> {
text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);

// Validate args
if max_input_tokens >= max_total_tokens {
return Err(RouterError::ArgumentValidation(
"`max_input_tokens` must be < `max_total_tokens`".to_string(),
));
}

if validation_workers == 0 {
return Err(RouterError::ArgumentValidation(
"`validation_workers` must be > 0".to_string(),

@@ -160,6 +157,28 @@ async fn main() -> Result<(), RouterError> {
// Validate remaining args now that the backend is known
let support_chunking = backend_info.support_chunking;
let max_batch_total_tokens = backend_info.max_batch_total_tokens;

if max_input_tokens.is_none() {
tracing::info!(
"Maximum input tokens defaulted to {}",
backend_info.max_input_tokens
);
}
if max_total_tokens.is_none() {
tracing::info!(
"Maximum total tokens defaulted to {}",
backend_info.max_total_tokens
);
}

let max_input_tokens = backend_info.max_input_tokens;
let max_total_tokens = backend_info.max_total_tokens;
if max_input_tokens >= max_total_tokens {
return Err(RouterError::ArgumentValidation(
"`max_input_tokens` must be < `max_total_tokens`".to_string(),
));
}

if max_input_tokens as u32 > max_batch_prefill_tokens && !support_chunking {
return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
}

@@ -194,6 +213,7 @@ async fn main() -> Result<(), RouterError> {
disable_grammar_support,
max_client_batch_size,
usage_stats,
payload_limit,
)
.await?;
Ok(())
@@ -180,7 +180,7 @@ async fn prefill(
let latency = start_time.elapsed();

// Compute throughput from latency and batch size
let throughput = batch_size as f64 / latency.as_secs_f64();
let throughput = (batch_size * sequence_length) as f64 / latency.as_secs_f64();

// Decode batch cannot be empty
let decode_batch = decode_batch.expect("decode_batch is None. This is a bug.");
@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.3.2-dev0"
"version": "2.4.1-dev0"
},
"paths": {
"/": {

@@ -36,8 +36,11 @@
"content": {
"application/json": {
"schema": {
"type": "array",
"items": {
"$ref": "#/components/schemas/GenerateResponse"
}
}
},
"text/event-stream": {
"schema": {

@@ -101,6 +104,47 @@
}
}
},
"/chat_tokenize": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Template and tokenize ChatRequest",
"operationId": "get_chat_tokenize",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Templated and tokenized ChatRequest",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatTokenizeResponse"
}
}
}
},
"404": {
"description": "Failed to tokenize ChatRequest",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
}
}
}
}
}
}
},
"/generate": {
"post": {
"tags": [

@@ -950,13 +994,6 @@
"example": "1.0",
"nullable": true
},
"guideline": {
"type": "string",
"description": "A guideline to be used in the chat_template",
"default": "null",
"example": "null",
"nullable": true
},
"logit_bias": {
"type": "array",
"items": {

@@ -1058,6 +1095,7 @@
"$ref": "#/components/schemas/ToolChoice"
}
],
"default": "auto",
"nullable": true
},
"tool_prompt": {

@@ -1092,6 +1130,21 @@
}
}
},
"ChatTokenizeResponse": {
"type": "object",
"required": [
"tokenize_response",
"templated_text"
],
"properties": {
"templated_text": {
"type": "string"
},
"tokenize_response": {
"$ref": "#/components/schemas/TokenizeResponse"
}
}
},
"Chunk": {
"type": "object",
"required": [

@@ -2235,14 +2288,6 @@
}
},
"ToolChoice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
}
],
"nullable": true
},
"ToolType": {
"oneOf": [
{
"type": "string",

@@ -2258,6 +2303,13 @@
"none"
]
},
{
"type": "string",
"description": "Means the model must call one or more tools.",
"enum": [
"required"
]
},
{
"type": "object",
"required": [

@@ -2270,8 +2322,7 @@
}
}
],
"description": "Controls which (if any) tool is called by the model.",
"example": "auto"
"description": "<https://platform.openai.com/docs/guides/function-calling/configuring-function-calling-behavior-using-the-tool_choice-parameter>"
},
"Url": {
"type": "object",
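For orientation, the new `/chat_tokenize` route described in the OpenAPI hunk above accepts a regular ChatRequest and returns a ChatTokenizeResponse. A minimal sketch, assuming a TGI server on localhost:8080; the host, port and message content are illustrative assumptions, not part of this diff:

```bash
# Hypothetical request against the new /chat_tokenize route; adjust host, port and payload to your deployment.
curl -s http://localhost:8080/chat_tokenize \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "tgi",
          "messages": [{"role": "user", "content": "What is deep learning?"}]
        }'
# Per the ChatTokenizeResponse schema, the reply contains "templated_text" and "tokenize_response".
```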
@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 \
--model-id $model
```
@@ -315,8 +315,6 @@ print(chat.choices[0].message.tool_calls)

TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.

However there are some minor differences in the API, for example `tool_choice="auto"` will ALWAYS choose the tool for you. This is different from OpenAI's API where `tool_choice="auto"` will choose a tool if the model thinks it's necessary.

```python
from openai import OpenAI

@@ -362,3 +360,61 @@ print(called)
# },
# }
```

### Tool Choice Configuration

When configuring how the model interacts with tools during a chat completion, there are several options for determining if or how a tool should be called. These options are controlled by the `tool_choice` parameter, which specifies the behavior of the model in relation to tool usage. The following modes are supported:

1. **`auto`**:

- The model decides whether to call a tool or generate a response message based on the user's input.
- If tools are provided, this is the default mode.
- Example usage:
```python
tool_choice="auto"
```

2. **`none`**:

- The model will never call any tools and will only generate a response message.
- If no tools are provided, this is the default mode.
- Example usage:
```python
tool_choice="none"
```

3. **`required`**:

- The model must call one or more tools and will not generate a response message on its own.
- Example usage:
```python
tool_choice="required"
```

4. **Specific Tool Call by Function Name**:
- You can force the model to call a specific tool either by specifying the tool function directly or by using an object definition.
- Two ways to do this:
1. Provide the function name as a string:
```python
tool_choice="get_current_weather"
```
2. Use the function object format:
```python
tool_choice={
"type": "function",
"function": {
"name": "get_current_weather"
}
}
```

These options allow flexibility when integrating tools with the chat completions endpoint. You can configure the model to either rely on tools automatically or force it to follow a predefined behavior, based on the needs of the task at hand.

---

| **Tool Choice Option** | **Description** | **When to Use** |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------eee |
| `auto` | The model decides whether to call a tool or generate a message. This is the default if tools are provided. | Use when you want the model to decide when a tool is necessary. |
| `none` | The model generates a message without calling any tools. This is the default if no tools are provided. | Use when you do not want the model to call any tools. |
| `required` | The model must call one or more tools and will not generate a message on its own. | Use when a tool call is mandatory, and you do not want a regular message generated. |
| Specific Tool Call (`name` or object) | Force the model to call a specific tool either by specifying its name (`tool_choice="get_current_weather"`) or using an object. | Use when you want to restrict the model to calling a particular tool for the response. |
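As a concrete illustration of the `required` mode documented in the hunk above, here is a minimal sketch against TGI's OpenAI-compatible chat endpoint; the URL, model name and tool definition are assumptions for illustration, reusing the `get_current_weather` tool already shown in these docs:

```bash
# Hypothetical example: force at least one tool call with tool_choice="required".
curl -s http://localhost:8080/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "tgi",
          "messages": [{"role": "user", "content": "What is the weather like in Paris?"}],
          "tools": [{
            "type": "function",
            "function": {
              "name": "get_current_weather",
              "description": "Get the current weather for a location",
              "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"]
              }
            }
          }],
          "tool_choice": "required"
        }'
```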
@@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model --quantize bitsandbytes
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes
```

4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.

@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model --quantize bitsandbytes-nf4
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes-nf4
```

You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

@@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize gptq
```

Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.3.1-rocm \
ghcr.io/huggingface/text-generation-inference:2.4.0-rocm \
--model-id $model
```
@@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.3.1-intel-xpu \
ghcr.io/huggingface/text-generation-inference:2.4.0-intel-xpu \
--model-id $model --cuda-graphs 0
```

@@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.3.1-intel-cpu \
ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu \
--model-id $model --cuda-graphs 0
```
@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.3.1 \
ghcr.io/huggingface/text-generation-inference:2.4.0 \
--model-id $model
```
@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.3.1 \
ghcr.io/huggingface/text-generation-inference:2.4.0 \
--model-id $model
```

@@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.

```bash
docker run ghcr.io/huggingface/text-generation-inference:2.3.1 --help
docker run ghcr.io/huggingface/text-generation-inference:2.4.0 --help
```

</Tip>
@@ -163,7 +163,7 @@ hub = {

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.2"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="2.4.0"),
env=hub,
role=role,
)
@@ -63,6 +63,7 @@ Options:

Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels

@@ -146,7 +147,7 @@ Options:
## MAX_INPUT_TOKENS
```shell
--max-input-tokens <MAX_INPUT_TOKENS>
This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_position_embeddings - 1, 4095)
This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_allocatable, max_position_embeddings) - 1

[env: MAX_INPUT_TOKENS=]

@@ -162,7 +163,7 @@ Options:
## MAX_TOTAL_TOKENS
```shell
--max-total-tokens <MAX_TOTAL_TOKENS>
This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_position_embeddings, 4096)
This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_allocatable, max_position_embeddings)

[env: MAX_TOTAL_TOKENS=]

@@ -455,6 +456,17 @@ Options:
- off: Disables all collection of usage statistics
- no-stack: Doesn't send the error stack trace or error type, but allows sending a crash event

```
## PAYLOAD_LIMIT
```shell
--payload-limit <PAYLOAD_LIMIT>
Payload size limit in bytes

Default is 2MB

[env: PAYLOAD_LIMIT=]
[default: 2000000]

```
## HELP
```shell
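Putting the launcher options documented in the hunks above together, here is a minimal sketch of an invocation that pins the token budget explicitly and raises the payload limit; the model id and the specific values are assumptions chosen only to illustrate the flags, and the two token limits can simply be omitted to let TGI infer them:

```bash
# Hypothetical invocation; values are illustrative, not recommendations.
text-generation-launcher \
    --model-id meta-llama/Llama-3.1-8B-Instruct \
    --max-input-tokens 4095 \
    --max-total-tokens 4096 \
    --payload-limit 4000000   # payload size limit in bytes; the default is 2000000 (2MB)
```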
@@ -24,6 +24,7 @@ Text Generation Inference enables serving optimized models. The following sectio
- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
- [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f)
- [Qwen 2 VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
- [Opt](https://huggingface.co/facebook/opt-6.7b)
- [T5](https://huggingface.co/google/flan-t5-xxl)
- [Galactica](https://huggingface.co/facebook/galactica-120b)
43
flake.lock
43
flake.lock
@ -108,11 +108,11 @@
|
||||
"pre-commit-hooks": "pre-commit-hooks_3"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1723311214,
|
||||
"narHash": "sha256-xdGZQBEa1AC2us/sY3igS/CucWY6jErXsAvCFRhB2LI=",
|
||||
"lastModified": 1730277369,
|
||||
"narHash": "sha256-yvQbeJbnnwCB68yv7uZXdGb+P7NMn5JMGBw0aBHymDI=",
|
||||
"owner": "nix-community",
|
||||
"repo": "crate2nix",
|
||||
"rev": "236f6addfd452a48be805819e3216af79e988fd5",
|
||||
"rev": "151122427d030874ebef3517cda766a6984e6ed6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -497,11 +497,11 @@
|
||||
"systems": "systems_7"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1726560853,
|
||||
"narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -581,11 +581,11 @@
|
||||
},
|
||||
"nix-filter": {
|
||||
"locked": {
|
||||
"lastModified": 1710156097,
|
||||
"narHash": "sha256-1Wvk8UP7PXdf8bCCaEoMnOT1qe5/Duqgj+rL8sRQsSM=",
|
||||
"lastModified": 1730207686,
|
||||
"narHash": "sha256-SCHiL+1f7q9TAnxpasriP6fMarWE5H43t25F5/9e28I=",
|
||||
"owner": "numtide",
|
||||
"repo": "nix-filter",
|
||||
"rev": "3342559a24e85fc164b295c3444e8a139924675b",
|
||||
"rev": "776e68c1d014c3adde193a18db9d738458cd2ba4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -718,16 +718,16 @@
|
||||
},
|
||||
"nixpkgs_6": {
|
||||
"locked": {
|
||||
"lastModified": 1727675176,
|
||||
"narHash": "sha256-xIjBFMYldWvj+g8ahxMPofsj+OqxvKJN6YylNHQ7gn4=",
|
||||
"owner": "nixos",
|
||||
"lastModified": 1732034459,
|
||||
"narHash": "sha256-Zais/zMRuJdlALidkUgEuasXOd37ZZLqkPkF9bIYSrY=",
|
||||
"owner": "danieldk",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "a6d0207fea9212d28cd3d487efe6bc699663b93a",
|
||||
"rev": "40280e7bf9743cdf563494db4ece2a43aa674fa8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixos-unstable-small",
|
||||
"owner": "danieldk",
|
||||
"ref": "outlines-v0.1.4-tgi",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
@ -853,11 +853,11 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1727836133,
|
||||
"narHash": "sha256-JE0zciM5IGWvK8J/pE2VldNBf7oyMH5WrU8tZArefbg=",
|
||||
"lastModified": 1730687492,
|
||||
"narHash": "sha256-xQVadjquBA/tFxDt5A55LJ1D1AvkVWsnrKC2o+pr8F4=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "02321540b0c8000b36889b1b974d1fec585b25a4",
|
||||
"rev": "41814763a2c597755b0755dbe3e721367a5e420f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -978,16 +978,15 @@
|
||||
"nixpkgs": "nixpkgs_6"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1729531056,
|
||||
"narHash": "sha256-dW9IOA31+j3VS19WAWAmkJW2YCzeVZGqd6HpIJfODtI=",
|
||||
"lastModified": 1732187990,
|
||||
"narHash": "sha256-93xEH3aUs6+D5Kab9DGBUX9vrEpwhm839wdp2yCg9hI=",
|
||||
"owner": "huggingface",
|
||||
"repo": "text-generation-inference-nix",
|
||||
"rev": "a84a90281a17b15762873845c947e5c78f5a8dd1",
|
||||
"rev": "f25a1cd889a6ae49c1e204232500005f82241a8b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "huggingface",
|
||||
"ref": "marlin-kernels-0.3.0",
|
||||
"repo": "text-generation-inference-nix",
|
||||
"type": "github"
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
|
||||
};
|
||||
nix-filter.url = "github:numtide/nix-filter";
|
||||
tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0";
|
||||
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
|
||||
nixpkgs.follows = "tgi-nix/nixpkgs";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
rust-overlay = {
|
||||
|
@ -1,3 +1,15 @@
|
||||
# ruff: noqa: E402
|
||||
import requests
|
||||
|
||||
|
||||
class SessionTimeoutFix(requests.Session):
|
||||
def request(self, *args, **kwargs):
|
||||
timeout = kwargs.pop("timeout", 120)
|
||||
return super().request(*args, **kwargs, timeout=timeout)
|
||||
|
||||
|
||||
requests.sessions.Session = SessionTimeoutFix
|
||||
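# Note (not in the original patch): the monkeypatch above gives every
# requests.Session created after this point a default 120 s timeout unless the
# caller passes an explicit `timeout=` argument, so test requests cannot hang
# indefinitely on an unresponsive server.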
|
||||
import asyncio
|
||||
import contextlib
|
||||
import json
|
||||
@ -557,7 +569,7 @@ def launcher(event_loop):
|
||||
devices=devices,
|
||||
volumes=volumes,
|
||||
ports={"80/tcp": port},
|
||||
healthcheck={"timeout": int(10 * 1e9)},
|
||||
healthcheck={"timeout": int(60 * 1e9), "retries": 2}, # 60s
|
||||
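# Note (not in the original patch): docker-py expects healthcheck durations in
# nanoseconds, hence int(60 * 1e9) for a 60-second timeout.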
shm_size="1G",
|
||||
)
|
||||
|
||||
|
@ -0,0 +1,104 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -6.3867188,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1318359,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.6875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.3007812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.4902344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -1.1171875,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 1268,
|
||||
"logprob": -0.9477539,
|
||||
"special": false,
|
||||
"text": " how"
|
||||
},
|
||||
{
|
||||
"id": 1587,
|
||||
"logprob": -0.51464844,
|
||||
"special": false,
|
||||
"text": " does"
|
||||
},
|
||||
{
|
||||
"id": 433,
|
||||
"logprob": -0.043182373,
|
||||
"special": false,
|
||||
"text": " it"
|
||||
},
|
||||
{
|
||||
"id": 1782,
|
||||
"logprob": -1.0810547,
|
||||
"special": false,
|
||||
"text": " differ"
|
||||
},
|
||||
{
|
||||
"id": 505,
|
||||
"logprob": -0.005054474,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 8776,
|
||||
"logprob": -0.47485352,
|
||||
"special": false,
|
||||
"text": " traditional"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.15112305,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0011291504,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": -0.31323242,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " and how does it differ from traditional machine learning?\n"
|
||||
}
|
@ -0,0 +1,99 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -6.3867188,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1318359,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.6875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.3007812,
|
||||
"text": " learning"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
},
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1101,
|
||||
"logprob": -1.0947266,
|
||||
"special": false,
|
||||
"text": " also"
|
||||
},
|
||||
{
|
||||
"id": 3967,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " known"
|
||||
},
|
||||
{
|
||||
"id": 439,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " as"
|
||||
},
|
||||
{
|
||||
"id": 30828,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " neural"
|
||||
},
|
||||
{
|
||||
"id": 4009,
|
||||
"logprob": -0.15563965,
|
||||
"special": false,
|
||||
"text": " network"
|
||||
},
|
||||
{
|
||||
"id": 477,
|
||||
"logprob": -1.4003906,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
|
||||
}
|
@ -0,0 +1,418 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -6.3867188,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1318359,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.6875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.3007812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.4902344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -1.1171875,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 1268,
|
||||
"logprob": -0.9477539,
|
||||
"special": false,
|
||||
"text": " how"
|
||||
},
|
||||
{
|
||||
"id": 1587,
|
||||
"logprob": -0.51464844,
|
||||
"special": false,
|
||||
"text": " does"
|
||||
},
|
||||
{
|
||||
"id": 433,
|
||||
"logprob": -0.043182373,
|
||||
"special": false,
|
||||
"text": " it"
|
||||
},
|
||||
{
|
||||
"id": 1782,
|
||||
"logprob": -1.0810547,
|
||||
"special": false,
|
||||
"text": " differ"
|
||||
},
|
||||
{
|
||||
"id": 505,
|
||||
"logprob": -0.005054474,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 8776,
|
||||
"logprob": -0.47485352,
|
||||
"special": false,
|
||||
"text": " traditional"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.15112305,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0011291504,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": -0.3173828,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " and how does it differ from traditional machine learning?\n"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -6.3867188,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1318359,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.6875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.3007812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.4902344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -1.1171875,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 1268,
|
||||
"logprob": -0.9477539,
|
||||
"special": false,
|
||||
"text": " how"
|
||||
},
|
||||
{
|
||||
"id": 1587,
|
||||
"logprob": -0.51464844,
|
||||
"special": false,
|
||||
"text": " does"
|
||||
},
|
||||
{
|
||||
"id": 433,
|
||||
"logprob": -0.043182373,
|
||||
"special": false,
|
||||
"text": " it"
|
||||
},
|
||||
{
|
||||
"id": 1782,
|
||||
"logprob": -1.0810547,
|
||||
"special": false,
|
||||
"text": " differ"
|
||||
},
|
||||
{
|
||||
"id": 505,
|
||||
"logprob": -0.005054474,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 8776,
|
||||
"logprob": -0.47485352,
|
||||
"special": false,
|
||||
"text": " traditional"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.15112305,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0011291504,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": -0.3173828,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " and how does it differ from traditional machine learning?\n"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -6.3867188,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1318359,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.6875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.3007812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.4902344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -1.1171875,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 1268,
|
||||
"logprob": -0.9477539,
|
||||
"special": false,
|
||||
"text": " how"
|
||||
},
|
||||
{
|
||||
"id": 1587,
|
||||
"logprob": -0.51464844,
|
||||
"special": false,
|
||||
"text": " does"
|
||||
},
|
||||
{
|
||||
"id": 433,
|
||||
"logprob": -0.043182373,
|
||||
"special": false,
|
||||
"text": " it"
|
||||
},
|
||||
{
|
||||
"id": 1782,
|
||||
"logprob": -1.0810547,
|
||||
"special": false,
|
||||
"text": " differ"
|
||||
},
|
||||
{
|
||||
"id": 505,
|
||||
"logprob": -0.005054474,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 8776,
|
||||
"logprob": -0.47485352,
|
||||
"special": false,
|
||||
"text": " traditional"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.15112305,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0011291504,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": -0.3173828,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " and how does it differ from traditional machine learning?\n"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -6.3867188,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1318359,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.6875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.3007812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.4902344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -1.1171875,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 1268,
|
||||
"logprob": -0.9477539,
|
||||
"special": false,
|
||||
"text": " how"
|
||||
},
|
||||
{
|
||||
"id": 1587,
|
||||
"logprob": -0.51464844,
|
||||
"special": false,
|
||||
"text": " does"
|
||||
},
|
||||
{
|
||||
"id": 433,
|
||||
"logprob": -0.043182373,
|
||||
"special": false,
|
||||
"text": " it"
|
||||
},
|
||||
{
|
||||
"id": 1782,
|
||||
"logprob": -1.0810547,
|
||||
"special": false,
|
||||
"text": " differ"
|
||||
},
|
||||
{
|
||||
"id": 505,
|
||||
"logprob": -0.005054474,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 8776,
|
||||
"logprob": -0.47485352,
|
||||
"special": false,
|
||||
"text": " traditional"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.15112305,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0011291504,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": -0.3173828,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " and how does it differ from traditional machine learning?\n"
|
||||
}
|
||||
]
|
@ -0,0 +1,99 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 3838,
|
||||
"logprob": null,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -8.59375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -10.921875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.56347656,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.5,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.6669922,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.08959961,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.14685059,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.81640625,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0013418198,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16027832,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0016393661,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.4477539,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2802734,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses"
|
||||
}
|
@ -0,0 +1,94 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 3838,
|
||||
"logprob": null,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -8.59375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -10.921875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.56347656,
|
||||
"text": " learning"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 1939,
|
||||
"logprob": -2.2675781,
|
||||
"special": false,
|
||||
"text": "?\n\n"
|
||||
},
|
||||
{
|
||||
"id": 33464,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 20909,
|
||||
"logprob": -0.37695312,
|
||||
"special": false,
|
||||
"text": " Learning"
|
||||
},
|
||||
{
|
||||
"id": 4102,
|
||||
"logprob": -1.9316406,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 285,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "is"
|
||||
},
|
||||
{
|
||||
"id": 458,
|
||||
"logprob": -0.80859375,
|
||||
"special": false,
|
||||
"text": " an"
|
||||
},
|
||||
{
|
||||
"id": 3082,
|
||||
"logprob": -1.4541016,
|
||||
"special": false,
|
||||
"text": " area"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 20443,
|
||||
"logprob": -0.5136719,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11229,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning?\n\nDeep Learning is an area of artificial intelligence"
|
||||
}
|
@ -0,0 +1,398 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 3838,
|
||||
"logprob": null,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -8.59375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -10.921875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.56347656,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.5,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.6669922,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.08959961,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.14685059,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.81640625,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0013418198,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16259766,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0016393661,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.4477539,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2802734,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 3838,
|
||||
"logprob": null,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -8.59375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -10.921875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.56347656,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.5,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.6669922,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.08959961,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.14685059,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.81640625,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0013418198,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16259766,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0016393661,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.4477539,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2802734,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 3838,
|
||||
"logprob": null,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -8.59375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -10.921875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.56347656,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.5,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.6669922,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.08959961,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.14685059,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.81640625,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0013418198,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16259766,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0016393661,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.4477539,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2802734,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 3838,
|
||||
"logprob": null,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -8.59375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -10.921875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.56347656,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.5,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.6669922,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.08959961,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.14685059,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.81640625,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0013418198,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16259766,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0016393661,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.4477539,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2802734,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses"
|
||||
}
|
||||
]
|
@ -0,0 +1,104 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.609375,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.92529297,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.94628906,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.9042969,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -0.8769531,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0076942444,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.25073242,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.097595215,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 955,
|
||||
"logprob": -0.921875,
|
||||
"special": false,
|
||||
"text": " type"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00027918816,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 21075,
|
||||
"logprob": -0.5527344,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11478,
|
||||
"logprob": -0.042541504,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.38891602,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 15836,
|
||||
"logprob": -0.0011043549,
|
||||
"special": false,
|
||||
"text": "AI"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a type of artificial intelligence (AI"
|
||||
}
|
@ -0,0 +1,99 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.609375,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.92529297,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.94628906,
|
||||
"text": " learning"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": -0.23840332,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
},
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1101,
|
||||
"logprob": -1.2011719,
|
||||
"special": false,
|
||||
"text": " also"
|
||||
},
|
||||
{
|
||||
"id": 3967,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " known"
|
||||
},
|
||||
{
|
||||
"id": 439,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " as"
|
||||
},
|
||||
{
|
||||
"id": 30828,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " neural"
|
||||
},
|
||||
{
|
||||
"id": 4009,
|
||||
"logprob": -0.6777344,
|
||||
"special": false,
|
||||
"text": " network"
|
||||
},
|
||||
{
|
||||
"id": 477,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
|
||||
}
|
@ -0,0 +1,418 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.609375,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.92529297,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.94628906,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.9042969,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -0.8769531,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0076942444,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.25146484,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.097595215,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 955,
|
||||
"logprob": -0.9248047,
|
||||
"special": false,
|
||||
"text": " type"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00027513504,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 21075,
|
||||
"logprob": -0.5527344,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11478,
|
||||
"logprob": -0.043151855,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.3840332,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 15836,
|
||||
"logprob": -0.0011043549,
|
||||
"special": false,
|
||||
"text": "AI"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a type of artificial intelligence (AI"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.6054688,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.92089844,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.94433594,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.90625,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -0.875,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.007698059,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.25268555,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.09753418,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 955,
|
||||
"logprob": -0.92529297,
|
||||
"special": false,
|
||||
"text": " type"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00027942657,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 21075,
|
||||
"logprob": -0.5527344,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11478,
|
||||
"logprob": -0.042541504,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.3840332,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 15836,
|
||||
"logprob": -0.0011053085,
|
||||
"special": false,
|
||||
"text": "AI"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a type of artificial intelligence (AI"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.6054688,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.92089844,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.94433594,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.90625,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -0.875,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.007698059,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.25268555,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.09753418,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 955,
|
||||
"logprob": -0.92529297,
|
||||
"special": false,
|
||||
"text": " type"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00027942657,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 21075,
|
||||
"logprob": -0.5527344,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11478,
|
||||
"logprob": -0.042541504,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.3840332,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 15836,
|
||||
"logprob": -0.0011053085,
|
||||
"special": false,
|
||||
"text": "AI"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a type of artificial intelligence (AI"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.6054688,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.92089844,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.94433594,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.90625,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -0.875,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.007698059,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.25268555,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.09753418,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 955,
|
||||
"logprob": -0.92529297,
|
||||
"special": false,
|
||||
"text": " type"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00027942657,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 21075,
|
||||
"logprob": -0.5527344,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11478,
|
||||
"logprob": -0.042541504,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.3840332,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 15836,
|
||||
"logprob": -0.0011053085,
|
||||
"special": false,
|
||||
"text": "AI"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a type of artificial intelligence (AI"
|
||||
}
|
||||
]
|
@ -0,0 +1,104 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 1841,
|
||||
"logprob": -5.46875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.69140625,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5271,
|
||||
"logprob": -12.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.32226562,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 235336,
|
||||
"logprob": -0.33203125,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 109,
|
||||
"logprob": -0.24707031,
|
||||
"special": false,
|
||||
"text": "\n\n"
|
||||
},
|
||||
{
|
||||
"id": 26843,
|
||||
"logprob": -0.14550781,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.038330078,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.029907227,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.020996094,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 38397,
|
||||
"logprob": -0.828125,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": -0.00049209595,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 6479,
|
||||
"logprob": -0.057373047,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.000207901,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"logprob": -0.15429688,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\n\nDeep learning is a subset of machine learning that"
|
||||
}
|
@ -0,0 +1,99 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 1841,
|
||||
"logprob": -5.46875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.69140625,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5271,
|
||||
"logprob": -12.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.32226562,
|
||||
"text": " learning"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 235336,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "?"
|
||||
},
|
||||
{
|
||||
"id": 109,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "\n\n"
|
||||
},
|
||||
{
|
||||
"id": 26843,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 14715,
|
||||
"logprob": -0.38671875,
|
||||
"special": false,
|
||||
"text": " Learning"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 38397,
|
||||
"logprob": -0.12695312,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 6479,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
|
||||
}
|
@ -0,0 +1,418 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 1841,
|
||||
"logprob": -5.46875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.69140625,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5271,
|
||||
"logprob": -12.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.32226562,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 235336,
|
||||
"logprob": -0.33203125,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 109,
|
||||
"logprob": -0.24707031,
|
||||
"special": false,
|
||||
"text": "\n\n"
|
||||
},
|
||||
{
|
||||
"id": 26843,
|
||||
"logprob": -0.14550781,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.03857422,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.030883789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.020996094,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 38397,
|
||||
"logprob": -0.828125,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": -0.00051498413,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 6479,
|
||||
"logprob": -0.05883789,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.00020694733,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"logprob": -0.15820312,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\n\nDeep learning is a subset of machine learning that"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 1841,
|
||||
"logprob": -5.46875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.71484375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5271,
|
||||
"logprob": -12.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.30859375,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 235336,
|
||||
"logprob": -0.3359375,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 109,
|
||||
"logprob": -0.23828125,
|
||||
"special": false,
|
||||
"text": "\n\n"
|
||||
},
|
||||
{
|
||||
"id": 26843,
|
||||
"logprob": -0.14550781,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.038330078,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.030883789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.020996094,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 38397,
|
||||
"logprob": -0.80859375,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": -0.0005455017,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 6479,
|
||||
"logprob": -0.05908203,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.00020599365,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"logprob": -0.17285156,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\n\nDeep learning is a subset of machine learning that"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 1841,
|
||||
"logprob": -5.46875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.71484375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5271,
|
||||
"logprob": -12.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.30859375,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 235336,
|
||||
"logprob": -0.3359375,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 109,
|
||||
"logprob": -0.23828125,
|
||||
"special": false,
|
||||
"text": "\n\n"
|
||||
},
|
||||
{
|
||||
"id": 26843,
|
||||
"logprob": -0.14550781,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.038330078,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.030883789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.020996094,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 38397,
|
||||
"logprob": -0.80859375,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": -0.0005455017,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 6479,
|
||||
"logprob": -0.05908203,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.00020599365,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"logprob": -0.17285156,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\n\nDeep learning is a subset of machine learning that"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 1841,
|
||||
"logprob": -5.46875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.71484375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5271,
|
||||
"logprob": -12.0,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.30859375,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 235336,
|
||||
"logprob": -0.3359375,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 109,
|
||||
"logprob": -0.23828125,
|
||||
"special": false,
|
||||
"text": "\n\n"
|
||||
},
|
||||
{
|
||||
"id": 26843,
|
||||
"logprob": -0.14550781,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.038330078,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.030883789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.020996094,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 38397,
|
||||
"logprob": -0.80859375,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": -0.0005455017,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 6479,
|
||||
"logprob": -0.05908203,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6044,
|
||||
"logprob": -0.00020599365,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"logprob": -0.17285156,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\n\nDeep learning is a subset of machine learning that"
|
||||
}
|
||||
]
|
@ -0,0 +1,104 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.5390625,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.86035156,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -8.828125,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.4912109,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.1152344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -1.765625,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.023864746,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.1060791,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.1940918,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -0.79785156,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.008262634,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.046569824,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0023479462,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -0.7626953,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5829,
|
||||
"logprob": -1.0107422,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Deep learning is a subset of machine learning that uses"
|
||||
}
|
@ -0,0 +1,99 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.5390625,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.86035156,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -8.828125,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.4912109,
|
||||
"text": " learning"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5380,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
},
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.19580078,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 16931,
|
||||
"logprob": -1.7783203,
|
||||
"special": false,
|
||||
"text": "DL"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": ")"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.4287109,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning?\nDeep learning (DL) is a subset of"
|
||||
}
|
@ -0,0 +1,418 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.5390625,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.86035156,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -8.828125,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.4912109,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.1152344,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -1.765625,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.024002075,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.10760498,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.19580078,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -0.7993164,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.008300781,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.046295166,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.002374649,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -0.7651367,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5829,
|
||||
"logprob": -1.0107422,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Deep learning is a subset of machine learning that uses"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.5351562,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.85791016,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -8.828125,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.4882812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.1210938,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -1.7597656,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.024032593,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.10748291,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.19592285,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -0.7988281,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.008354187,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.046569824,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0023517609,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -0.7661133,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5829,
|
||||
"logprob": -1.0107422,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Deep learning is a subset of machine learning that uses"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.5351562,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.85791016,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -8.828125,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.4882812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.1210938,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -1.7597656,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.024032593,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.10748291,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.19592285,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -0.7988281,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.008354187,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.046569824,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0023517609,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -0.7661133,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5829,
|
||||
"logprob": -1.0107422,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Deep learning is a subset of machine learning that uses"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -7.5351562,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.85791016,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -8.828125,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.4882812,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -2.1210938,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -1.7597656,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.024032593,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.10748291,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.19592285,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -0.7988281,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.008354187,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.046569824,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0023517609,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -0.7661133,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5829,
|
||||
"logprob": -1.0107422,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Deep learning is a subset of machine learning that uses"
|
||||
}
|
||||
]
|
@ -1,8 +1,8 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "stop_sequence",
|
||||
"generated_tokens": 5,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
@ -11,12 +11,12 @@
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.5625,
|
||||
"logprob": -9.5234375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.4375,
|
||||
"logprob": -10.421875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
@ -24,36 +24,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 25,
|
||||
"logprob": -0.8984375,
|
||||
"logprob": -0.88183594,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 923,
|
||||
"logprob": -2.84375,
|
||||
"id": 2209,
|
||||
"logprob": -2.6699219,
|
||||
"special": false,
|
||||
"text": " add"
|
||||
"text": " Is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": 0.0,
|
||||
"id": 279,
|
||||
"logprob": -0.61083984,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 734,
|
||||
"logprob": -2.6660156,
|
||||
"special": false,
|
||||
"text": " function"
|
||||
},
|
||||
{
|
||||
"id": 330,
|
||||
"logprob": -0.31640625,
|
||||
"logprob": -0.35498047,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
},
|
||||
{
|
||||
"id": 1985,
|
||||
"logprob": 0.0,
|
||||
"id": 4110,
|
||||
"logprob": -2.4101562,
|
||||
"special": false,
|
||||
"text": "test"
|
||||
"text": "Create"
|
||||
},
|
||||
{
|
||||
"id": 7575,
|
||||
"logprob": -2.2304688,
|
||||
"special": false,
|
||||
"text": "Process"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": -0.080078125,
|
||||
"special": false,
|
||||
"text": "\""
|
||||
},
|
||||
{
|
||||
"id": 304,
|
||||
"logprob": -0.75439453,
|
||||
"special": false,
|
||||
"text": " in"
|
||||
},
|
||||
{
|
||||
"id": 12468,
|
||||
"logprob": -1.8769531,
|
||||
"special": false,
|
||||
"text": " Win"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Test request: add a \"test"
|
||||
"generated_text": "Test request: Is the function \"CreateProcess\" in Win"
|
||||
}
|
||||
|
@ -16,17 +16,17 @@
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -11.75,
|
||||
"logprob": -11.8359375,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -2.0625,
|
||||
"logprob": -2.0703125,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -6.0,
|
||||
"logprob": -5.9765625,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -40,25 +40,25 @@
|
||||
},
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -0.11279297,
|
||||
"logprob": -0.12512207,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.16015625,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.25195312,
|
||||
"logprob": -0.23840332,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 16931,
|
||||
"logprob": -1.703125,
|
||||
"logprob": -2.0175781,
|
||||
"special": false,
|
||||
"text": "DL"
|
||||
},
|
||||
@ -70,7 +70,7 @@
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.140625,
|
||||
"logprob": -0.8613281,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
@ -82,7 +82,7 @@
|
||||
},
|
||||
{
|
||||
"id": 1207,
|
||||
"logprob": -1.3125,
|
||||
"logprob": -1.2451172,
|
||||
"special": false,
|
||||
"text": " sub"
|
||||
},
|
||||
|
@ -0,0 +1,26 @@
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null,
|
||||
"message": {
|
||||
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
|
||||
"name": null,
|
||||
"role": "assistant",
|
||||
"tool_calls": null
|
||||
},
|
||||
"usage": null
|
||||
}
|
||||
],
|
||||
"created": 1730164250,
|
||||
"id": "",
|
||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 58,
|
||||
"prompt_tokens": 349,
|
||||
"total_tokens": 407
|
||||
}
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "",
|
||||
"role": "assistant",
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 1730416361,
|
||||
"id": "",
|
||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": null
|
||||
}
|
@ -21,22 +21,22 @@
|
||||
},
|
||||
{
|
||||
"id": 81,
|
||||
"logprob": -0.25585938,
|
||||
"logprob": -0.25927734,
|
||||
"text": "_"
|
||||
},
|
||||
{
|
||||
"id": 6009,
|
||||
"logprob": -2.2304688,
|
||||
"logprob": -2.2109375,
|
||||
"text": "mean"
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"logprob": -0.29760742,
|
||||
"logprob": -0.2993164,
|
||||
"text": "("
|
||||
},
|
||||
{
|
||||
"id": 62,
|
||||
"logprob": -5.6796875,
|
||||
"logprob": -5.671875,
|
||||
"text": "L"
|
||||
},
|
||||
{
|
||||
@ -46,22 +46,22 @@
|
||||
},
|
||||
{
|
||||
"id": 1682,
|
||||
"logprob": -0.67626953,
|
||||
"logprob": -0.6777344,
|
||||
"text": " List"
|
||||
},
|
||||
{
|
||||
"id": 77,
|
||||
"logprob": -0.38842773,
|
||||
"logprob": -0.38354492,
|
||||
"text": "["
|
||||
},
|
||||
{
|
||||
"id": 1808,
|
||||
"logprob": -0.9165039,
|
||||
"logprob": -0.91845703,
|
||||
"text": "float"
|
||||
},
|
||||
{
|
||||
"id": 10794,
|
||||
"logprob": -2.5527344,
|
||||
"logprob": -2.5371094,
|
||||
"text": "]):"
|
||||
}
|
||||
],
|
||||
|
@ -5,7 +5,7 @@
|
||||
"index": 0,
|
||||
"logprobs": null,
|
||||
"message": {
|
||||
"content": "{\n \"temperature\": [\n 35,\n 34,\n 36\n ],\n \"unit\": \"°c\"\n}",
|
||||
"content": "{ \"temperature\": [ 26, 30, 33, 29 ] ,\"unit\": \"Fahrenheit\" }",
|
||||
"role": "assistant"
|
||||
}
|
||||
}
|
||||
|
@ -18,7 +18,7 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.3.1-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 10,
|
||||
"prompt_tokens": 50,
|
||||
@ -44,7 +44,7 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.3.1-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 10,
|
||||
"prompt_tokens": 50,
|
||||
@ -70,7 +70,7 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.3.1-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 10,
|
||||
"prompt_tokens": 50,
|
||||
@ -96,7 +96,7 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.3.1-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 10,
|
||||
"prompt_tokens": 50,
|
||||
|
@ -17,7 +17,7 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.3.1-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 10,
|
||||
"prompt_tokens": 50,
|
||||
|
@ -17,7 +17,7 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "2.3.2-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 23,
|
||||
"prompt_tokens": 604,
|
||||
|
@ -15,6 +15,6 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
"system_fingerprint": "2.3.2-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": null
|
||||
}
|
||||
|
@ -15,6 +15,6 @@
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
"system_fingerprint": "2.3.2-dev0-native",
|
||||
"system_fingerprint": "2.4.1-dev0-native",
|
||||
"usage": null
|
||||
}
|
||||
|
@ -0,0 +1,27 @@
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"role": "assistant",
|
||||
"tool_calls": {
|
||||
"function": {
|
||||
"arguments": "<|eot_id|>",
|
||||
"name": null
|
||||
},
|
||||
"id": "",
|
||||
"index": 0,
|
||||
"type": "function"
|
||||
}
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 1729084854,
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
"system_fingerprint": "2.3.2-dev0-native",
|
||||
"usage": null
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " deep",
|
||||
"role": "assistant",
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": "length",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 1729262528,
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
"system_fingerprint": "2.3.2-dev0-native",
|
||||
"usage": null
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": null,
|
||||
"role": "assistant",
|
||||
"tool_calls": {
|
||||
"function": {
|
||||
"arguments": "<|eot_id|>",
|
||||
"name": null
|
||||
},
|
||||
"id": "",
|
||||
"index": 0,
|
||||
"type": "function"
|
||||
}
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 1729084850,
|
||||
"id": "",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
"system_fingerprint": "2.3.2-dev0-native",
|
||||
"usage": null
|
||||
}
|
@ -3,7 +3,7 @@ import pytest
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def bloom_560_handle(launcher):
|
||||
with launcher("bigscience/bloom-560m") as handle:
|
||||
with launcher("bigscience/bloom-560m", num_shard=1) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
|
90
integration-tests/models/test_compressed_tensors_w8a8_int.py
Normal file
90
integration-tests/models/test_compressed_tensors_w8a8_int.py
Normal file
@ -0,0 +1,90 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def compressed_tensors_w8a8_int_handle(launcher):
|
||||
with launcher(
|
||||
"neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8",
|
||||
num_shard=2,
|
||||
quantize="compressed-tensors",
|
||||
) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
async def compressed_tensors_w8a8_int(compressed_tensors_w8a8_int_handle):
|
||||
await compressed_tensors_w8a8_int_handle.health(300)
|
||||
return compressed_tensors_w8a8_int_handle.client
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8a8_int(
|
||||
compressed_tensors_w8a8_int, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_w8a8_int.generate(
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== " and how does it differ from traditional machine learning?\n"
|
||||
)
|
||||
assert response.details.generated_tokens == 10
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8a8_int_all_params(
|
||||
compressed_tensors_w8a8_int, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_w8a8_int.generate(
|
||||
"What is deep learning",
|
||||
max_new_tokens=10,
|
||||
repetition_penalty=1.2,
|
||||
return_full_text=True,
|
||||
stop_sequences=["test"],
|
||||
temperature=0.5,
|
||||
top_p=0.9,
|
||||
top_k=10,
|
||||
truncate=5,
|
||||
typical_p=0.9,
|
||||
watermark=True,
|
||||
decoder_input_details=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
assert response.details.generated_tokens == 10
|
||||
assert (
|
||||
response.generated_text
|
||||
== "What is deep learning?\nDeep learning, also known as neural network or"
|
||||
)
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8a8_int_load(
|
||||
compressed_tensors_w8a8_int, generate_load, response_snapshot
|
||||
):
|
||||
responses = await generate_load(
|
||||
compressed_tensors_w8a8_int,
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
n=4,
|
||||
)
|
||||
|
||||
assert (
|
||||
responses[0].generated_text
|
||||
== " and how does it differ from traditional machine learning?\n"
|
||||
)
|
||||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == response_snapshot
|
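These fixtures exercise the new `compressed-tensors` quantization path that the launcher gains further down in this diff (the `Quantization::CompressedTensors` variant). For reference, a rough sketch of the equivalent manual launch, assuming the usual `text-generation-launcher` entry point and its `--quantize` flag; only the model id and shard count are taken from the fixture above:

    text-generation-launcher \
        --model-id neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8 \
        --num-shard 2 \
        --quantize compressed-tensors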
@ -0,0 +1,92 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def compressed_tensors_w8a8_int_dynamic_weight_handle(launcher):
|
||||
with launcher(
|
||||
"danieldk/Qwen2.5-1.5B-Instruct-w8a8-int-dynamic-weight",
|
||||
num_shard=2,
|
||||
quantize="compressed-tensors",
|
||||
) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
async def compressed_tensors_w8a8_int_dynamic_weight(
|
||||
compressed_tensors_w8a8_int_dynamic_weight_handle,
|
||||
):
|
||||
await compressed_tensors_w8a8_int_dynamic_weight_handle.health(300)
|
||||
return compressed_tensors_w8a8_int_dynamic_weight_handle.client
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8a8_int_dynamic_weight(
|
||||
compressed_tensors_w8a8_int_dynamic_weight, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== " Deep learning is a subset of machine learning that uses"
|
||||
)
|
||||
assert response.details.generated_tokens == 10
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params(
|
||||
compressed_tensors_w8a8_int_dynamic_weight, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
|
||||
"What is deep learning",
|
||||
max_new_tokens=10,
|
||||
repetition_penalty=1.2,
|
||||
return_full_text=True,
|
||||
stop_sequences=["test"],
|
||||
temperature=0.5,
|
||||
top_p=0.9,
|
||||
top_k=10,
|
||||
truncate=5,
|
||||
typical_p=0.9,
|
||||
watermark=True,
|
||||
decoder_input_details=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
assert response.details.generated_tokens == 10
|
||||
assert (
|
||||
response.generated_text
|
||||
== "What is deep learning?\n\nDeep Learning is an area of artificial intelligence"
|
||||
)
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8a8_int_dynamic_weight_load(
|
||||
compressed_tensors_w8a8_int_dynamic_weight, generate_load, response_snapshot
|
||||
):
|
||||
responses = await generate_load(
|
||||
compressed_tensors_w8a8_int_dynamic_weight,
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
n=4,
|
||||
)
|
||||
|
||||
assert (
|
||||
responses[0].generated_text
|
||||
== " Deep learning is a subset of machine learning that uses"
|
||||
)
|
||||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == response_snapshot
|
86
integration-tests/models/test_compressed_tensors_w8an_fp.py
Normal file
86
integration-tests/models/test_compressed_tensors_w8an_fp.py
Normal file
@ -0,0 +1,86 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def compressed_tensors_w8an_handle(launcher):
|
||||
with launcher(
|
||||
"neuralmagic/Llama-3.2-1B-Instruct-FP8",
|
||||
num_shard=2,
|
||||
quantize="compressed-tensors",
|
||||
) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
|
||||
await compressed_tensors_w8an_handle.health(300)
|
||||
return compressed_tensors_w8an_handle.client
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
|
||||
response = await compressed_tensors_w8an.generate(
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== " Deep learning is a type of artificial intelligence (AI"
|
||||
)
|
||||
assert response.details.generated_tokens == 10
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_compressed_tensors_w8an_all_params(
|
||||
compressed_tensors_w8an, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_w8an.generate(
|
||||
"What is deep learning",
|
||||
max_new_tokens=10,
|
||||
repetition_penalty=1.2,
|
||||
return_full_text=True,
|
||||
stop_sequences=["test"],
|
||||
temperature=0.5,
|
||||
top_p=0.9,
|
||||
top_k=10,
|
||||
truncate=5,
|
||||
typical_p=0.9,
|
||||
watermark=True,
|
||||
decoder_input_details=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
assert response.details.generated_tokens == 10
|
||||
assert (
|
||||
response.generated_text
|
||||
== "What is deep learning?\nDeep learning, also known as neural network or"
|
||||
)
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_w8an_load(
|
||||
compressed_tensors_w8an, generate_load, response_snapshot
|
||||
):
|
||||
responses = await generate_load(
|
||||
compressed_tensors_w8an,
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
n=4,
|
||||
)
|
||||
|
||||
assert (
|
||||
responses[0].generated_text
|
||||
== " Deep learning is a type of artificial intelligence (AI"
|
||||
)
|
||||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == response_snapshot
|
@ -0,0 +1,86 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def compressed_tensors_wna16_handle(launcher):
|
||||
with launcher(
|
||||
"neuralmagic/gemma-2-2b-it-quantized.w4a16",
|
||||
num_shard=2,
|
||||
quantize="compressed-tensors",
|
||||
) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
|
||||
await compressed_tensors_wna16_handle.health(300)
|
||||
return compressed_tensors_wna16_handle.client
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
|
||||
response = await compressed_tensors_wna16.generate(
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== "\n\nDeep learning is a subset of machine learning that"
|
||||
)
|
||||
assert response.details.generated_tokens == 10
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_compressed_tensors_wna16_all_params(
|
||||
compressed_tensors_wna16, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_wna16.generate(
|
||||
"What is deep learning",
|
||||
max_new_tokens=10,
|
||||
repetition_penalty=1.2,
|
||||
return_full_text=True,
|
||||
stop_sequences=["test"],
|
||||
temperature=0.5,
|
||||
top_p=0.9,
|
||||
top_k=10,
|
||||
truncate=5,
|
||||
typical_p=0.9,
|
||||
watermark=True,
|
||||
decoder_input_details=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
assert response.details.generated_tokens == 10
|
||||
assert (
|
||||
response.generated_text
|
||||
== "What is deep learning?\n\nDeep Learning is a subset of machine learning"
|
||||
)
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_wna16_load(
|
||||
compressed_tensors_wna16, generate_load, response_snapshot
|
||||
):
|
||||
responses = await generate_load(
|
||||
compressed_tensors_wna16,
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
n=4,
|
||||
)
|
||||
|
||||
assert (
|
||||
responses[0].generated_text
|
||||
== "\n\nDeep learning is a subset of machine learning that"
|
||||
)
|
||||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == response_snapshot
|
@ -0,0 +1,90 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def compressed_tensors_wna16_int_24_handle(launcher):
|
||||
with launcher(
|
||||
"danieldk/Llama-3.1-8B-w4a16-int-24",
|
||||
num_shard=2,
|
||||
quantize="compressed-tensors",
|
||||
) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
async def compressed_tensors_wna16_int_24(compressed_tensors_wna16_int_24_handle):
|
||||
await compressed_tensors_wna16_int_24_handle.health(300)
|
||||
return compressed_tensors_wna16_int_24_handle.client
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_wna16_int_24(
|
||||
compressed_tensors_wna16_int_24, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_wna16_int_24.generate(
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== "Deep learning is a subset of machine learning that uses"
|
||||
)
|
||||
assert response.details.generated_tokens == 10
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_wna16_int_24_all_params(
|
||||
compressed_tensors_wna16_int_24, response_snapshot
|
||||
):
|
||||
response = await compressed_tensors_wna16_int_24.generate(
|
||||
"What is deep learning",
|
||||
max_new_tokens=10,
|
||||
repetition_penalty=1.2,
|
||||
return_full_text=True,
|
||||
stop_sequences=["test"],
|
||||
temperature=0.5,
|
||||
top_p=0.9,
|
||||
top_k=10,
|
||||
truncate=5,
|
||||
typical_p=0.9,
|
||||
watermark=True,
|
||||
decoder_input_details=True,
|
||||
seed=0,
|
||||
)
|
||||
|
||||
assert response.details.generated_tokens == 10
|
||||
assert (
|
||||
response.generated_text
|
||||
== "What is deep learning?\nDeep learning (DL) is a subset of"
|
||||
)
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.release
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_compressed_tensors_wna16_int_24_load(
|
||||
compressed_tensors_wna16_int_24, generate_load, response_snapshot
|
||||
):
|
||||
responses = await generate_load(
|
||||
compressed_tensors_wna16_int_24,
|
||||
"What is deep learning?",
|
||||
max_new_tokens=10,
|
||||
n=4,
|
||||
)
|
||||
|
||||
assert (
|
||||
responses[0].generated_text
|
||||
== "Deep learning is a subset of machine learning that uses"
|
||||
)
|
||||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == response_snapshot
|
80
integration-tests/models/test_flash_qwen2_vl.py
Normal file
80
integration-tests/models/test_flash_qwen2_vl.py
Normal file
@ -0,0 +1,80 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def flash_qwen2_vl_handle(launcher):
|
||||
with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
async def flash_qwen2(flash_qwen2_vl_handle):
|
||||
await flash_qwen2_vl_handle.health(300)
|
||||
return flash_qwen2_vl_handle.client
|
||||
|
||||
|
||||
@pytest.mark.private
|
||||
async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
|
||||
response = await flash_qwen2.chat(
|
||||
max_tokens=100,
|
||||
seed=42,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
assert (
|
||||
response.choices[0].message.content
|
||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
|
||||
)
|
||||
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.private
|
||||
async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
|
||||
responses = await flash_qwen2.chat(
|
||||
max_tokens=100,
|
||||
seed=42,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
],
|
||||
},
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
count = 0
|
||||
generated = ""
|
||||
last_response = None
|
||||
async for response in responses:
|
||||
count += 1
|
||||
generated += response.choices[0].delta.content
|
||||
last_response = response
|
||||
|
||||
assert (
|
||||
generated
|
||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
|
||||
)
|
||||
assert count == 58
|
||||
assert last_response == response_snapshot
|
@ -55,6 +55,7 @@ async def test_flash_starcoder_gptq_load(
|
||||
)
|
||||
|
||||
assert len(responses) == 4
|
||||
assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
# XXX: TODO: Fix this test.
|
||||
# assert all([r.generated_text == responses[0].generated_text for r in responses])
|
||||
|
||||
assert responses == generous_response_snapshot
|
||||
# assert responses == generous_response_snapshot
|
||||
|
@ -55,10 +55,7 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh
|
||||
called = chat_completion["choices"][0]["message"]["content"]
|
||||
|
||||
assert response.status_code == 200
|
||||
assert (
|
||||
called
|
||||
== '{\n "temperature": [\n 35,\n 34,\n 36\n ],\n "unit": "°c"\n}'
|
||||
)
|
||||
assert called == '{ "temperature": [ 26, 30, 33, 29 ] ,"unit": "Fahrenheit" }'
|
||||
assert chat_completion == response_snapshot
|
||||
|
||||
|
||||
|
@ -3,7 +3,7 @@ import pytest
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def fused_kernel_mamba_handle(launcher):
|
||||
with launcher("state-spaces/mamba-130m", num_shard=1) as handle:
|
||||
with launcher("state-spaces/mamba-130m-hf", num_shard=1) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
|
@ -79,12 +79,12 @@ async def test_mllama_load(mllama, generate_load, response_snapshot):
|
||||
]
|
||||
responses = await asyncio.gather(*futures)
|
||||
|
||||
generated_texts = [response.choices[0].message.content for response in responses]
|
||||
_ = [response.choices[0].message.content for response in responses]
|
||||
|
||||
assert generated_texts[0] == "In a bustling city, a chicken named Cluck"
|
||||
assert len(generated_texts) == 4
|
||||
assert generated_texts, all(
|
||||
[text == generated_texts[0] for text in generated_texts]
|
||||
)
|
||||
|
||||
assert responses == response_snapshot
|
||||
# XXX: TODO: Fix this test.
|
||||
# assert generated_texts[0] == "In a bustling city, a chicken named Cluck"
|
||||
# assert len(generated_texts) == 4
|
||||
# assert generated_texts, all(
|
||||
# [text == generated_texts[0] for text in generated_texts]
|
||||
# )
|
||||
# assert responses == response_snapshot
|
||||
|
@ -1,4 +1,6 @@
|
||||
import pytest
|
||||
import requests
|
||||
import json
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@ -174,7 +176,7 @@ async def test_flash_llama_grammar_tools_choice(
|
||||
"function": {
|
||||
"description": None,
|
||||
"name": "get_current_weather",
|
||||
"arguments": {"format": "celsius", "location": "Brooklyn, NY"},
|
||||
"arguments": {"format": "celsius", "location": "Brooklyn, New York"},
|
||||
},
|
||||
}
|
||||
]
|
||||
@ -327,3 +329,142 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream(
|
||||
== "Once upon a time, in the ocean, there lived three sea creatures. There was a wise old octopus named Bob, a mischievous seagull named Sam, and a gentle sea turtle named Luna. They all lived together in a beautiful coral reef, surrounded by colorful fish and swaying sea fans"
|
||||
)
|
||||
assert last_response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_flash_llama_grammar_tools_sea_creatures_stream_required(
|
||||
flash_llama_grammar_tools, response_snapshot
|
||||
):
|
||||
responses = await flash_llama_grammar_tools.chat(
|
||||
max_tokens=100,
|
||||
seed=24,
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tell me a story about 3 sea creatures",
|
||||
},
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
count = 0
|
||||
tool_calls_generated = ""
|
||||
last_response = None
|
||||
async for response in responses:
|
||||
count += 1
|
||||
assert response.choices[0].delta.content is None
|
||||
tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments
|
||||
last_response = response
|
||||
|
||||
assert count == 29
|
||||
assert (
|
||||
tool_calls_generated
|
||||
== '{"function": {"_name": "get_current_weather", "format": "celsius", "location": "San Francisco, CA"}}<|eot_id|>'
|
||||
)
|
||||
assert last_response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_flash_llama_grammar_tools_sea_creatures_stream_none(
|
||||
flash_llama_grammar_tools, response_snapshot
|
||||
):
|
||||
responses = await flash_llama_grammar_tools.chat(
|
||||
max_tokens=100,
|
||||
seed=24,
|
||||
tools=tools,
|
||||
tool_choice="none",
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tell me a story about 3 sea creatures",
|
||||
},
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
count = 0
|
||||
content_generated = ""
|
||||
last_response = None
|
||||
async for response in responses:
|
||||
count += 1
|
||||
content_generated += response.choices[0].delta.content
|
||||
last_response = response
|
||||
assert response.choices[0].delta.tool_calls is None
|
||||
|
||||
assert count == 100
|
||||
print(content_generated)
|
||||
assert (
|
||||
content_generated
|
||||
== "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish, lived three dear friends: Luna the sea turtle, Finley the friendly fish, and Crusty the wise crab.\n\nLuna was the oldest of the three. She had traveled the world, exploring hidden caves and shipwrecks, and collecting sparkling shells and shiny pebbles. Her shell was a beautiful mosaic of blues and greens, and her gentle eyes twinkled with the secrets of the deep"
|
||||
)
|
||||
assert last_response == response_snapshot
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.private
|
||||
async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object(
|
||||
flash_llama_grammar_tools, response_snapshot
|
||||
):
|
||||
# using `requests` to send the request until the client library supports tool_choice as a function object
|
||||
responses = requests.post(
|
||||
f"{flash_llama_grammar_tools.base_url}/v1/chat/completions",
|
||||
headers=flash_llama_grammar_tools.headers,
|
||||
json={
|
||||
"model": "tgi",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Tell me a story about 3 sea creatures",
|
||||
},
|
||||
],
|
||||
"tools": tools,
|
||||
"tool_choice": {
|
||||
"type": "function",
|
||||
"function": {"name": "get_n_day_weather_forecast"},
|
||||
},
|
||||
"seed": 24,
|
||||
"max_tokens": 100,
|
||||
"stream": True,
|
||||
},
|
||||
stream=True,
|
||||
)
|
||||
# iterate over the response in chunks
|
||||
count = 0
|
||||
tool_calls_generated = ""
|
||||
last_response = None
|
||||
for chunk in responses.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
count += 1
|
||||
# remove the "data: " prefix, trailing newline, and split the chunk into individual lines
|
||||
lines = chunk.decode("utf-8").replace("data: ", "").rstrip("\n").split("\n")
|
||||
for line in lines:
|
||||
if line == "[DONE]":
|
||||
break
|
||||
response = json.loads(line)
|
||||
tool_calls_generated += response["choices"][0]["delta"]["tool_calls"][
|
||||
"function"
|
||||
]["arguments"]
|
||||
last_response = response
|
||||
|
||||
assert count == 39
|
||||
assert (
|
||||
tool_calls_generated
|
||||
== '{"function": {"_name": "get_n_day_weather_forecast", "format": "celsius", "location": "San Francisco, CA", "num_days":3}}<|eot_id|>'
|
||||
)
|
||||
assert last_response == response_snapshot
|
||||
|
@ -212,6 +212,8 @@ enum Quantization {
|
||||
/// <https://hf.co/models?search=awq>.
|
||||
/// Should replace GPTQ models wherever possible because of the better latency
|
||||
Awq,
|
||||
/// Compressed tensors, which can be a mixture of different quantization methods.
|
||||
CompressedTensors,
|
||||
/// 8 bit quantization, doesn't require specific model.
|
||||
/// Should be a drop-in replacement to bitsandbytes with much better performance.
|
||||
/// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
|
||||
@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
|
||||
Quantization::Awq => {
|
||||
write!(f, "awq")
|
||||
}
|
||||
Quantization::CompressedTensors => {
|
||||
write!(f, "compressed-tensors")
|
||||
}
|
||||
Quantization::Eetq => {
|
||||
write!(f, "eetq")
|
||||
}
|
||||
@ -472,7 +477,7 @@ struct Args {
|
||||
/// for users. The larger this value, the longer prompt users can send which
|
||||
/// can impact the overall memory required to handle the load.
|
||||
/// Please note that some models have a finite range of sequence they can handle.
|
||||
/// Default to min(max_position_embeddings - 1, 4095)
|
||||
/// Default to min(max_allocatable, max_position_embeddings) - 1
|
||||
#[clap(long, env)]
|
||||
max_input_tokens: Option<usize>,
|
||||
|
||||
@ -488,7 +493,7 @@ struct Args {
|
||||
/// `1511` max_new_tokens.
|
||||
/// The larger this value, the larger amount each request will be in your RAM
|
||||
/// and the less effective batching can be.
|
||||
/// Default to min(max_position_embeddings, 4096)
|
||||
/// Default to min(max_allocatable, max_position_embeddings)
|
||||
#[clap(long, env)]
|
||||
max_total_tokens: Option<usize>,
|
||||
|
||||
@ -687,6 +692,12 @@ struct Args {
|
||||
/// Default is on.
|
||||
#[clap(default_value = "on", long, env)]
|
||||
usage_stats: UsageStatsLevel,
|
||||
|
||||
/// Payload size limit in bytes
|
||||
///
|
||||
/// Default is 2MB
|
||||
#[clap(default_value = "2000000", long, env)]
|
||||
payload_limit: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@ -718,9 +729,9 @@ fn shard_manager(
|
||||
cuda_memory_fraction: f32,
|
||||
rope_scaling: Option<RopeScaling>,
|
||||
rope_factor: Option<f32>,
|
||||
max_total_tokens: usize,
|
||||
max_total_tokens: Option<usize>,
|
||||
max_batch_size: Option<usize>,
|
||||
max_input_tokens: usize,
|
||||
max_input_tokens: Option<usize>,
|
||||
lora_adapters: Option<String>,
|
||||
otlp_endpoint: Option<String>,
|
||||
otlp_service_name: String,
|
||||
@ -805,8 +816,10 @@ fn shard_manager(
|
||||
shard_args.push(otlp_service_name);
|
||||
|
||||
// In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
|
||||
if let Some(max_input_tokens) = max_input_tokens {
|
||||
shard_args.push("--max-input-tokens".to_string());
|
||||
shard_args.push(max_input_tokens.to_string());
|
||||
}
|
||||
|
||||
// Copy current process env
|
||||
let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
|
||||
@ -854,10 +867,12 @@ fn shard_manager(
|
||||
envs.push(("ROPE_FACTOR".into(), factor.to_string().into()));
|
||||
}
|
||||
|
||||
if let Some(max_total_tokens) = max_total_tokens {
|
||||
envs.push((
|
||||
"MAX_TOTAL_TOKENS".into(),
|
||||
max_total_tokens.to_string().into(),
|
||||
));
|
||||
}
|
||||
if let Some(max_batch_size) = max_batch_size {
|
||||
envs.push(("MAX_BATCH_SIZE".into(), max_batch_size.to_string().into()));
|
||||
}
|
||||
@ -1315,8 +1330,8 @@ fn spawn_shards(
|
||||
num_shard: usize,
|
||||
args: &Args,
|
||||
cuda_graphs: Vec<usize>,
|
||||
max_total_tokens: usize,
|
||||
max_input_tokens: usize,
|
||||
max_total_tokens: Option<usize>,
|
||||
max_input_tokens: Option<usize>,
|
||||
quantize: Option<Quantization>,
|
||||
max_log_level: LevelFilter,
|
||||
shutdown: Arc<AtomicBool>,
|
||||
@ -1434,8 +1449,8 @@ fn compute_type(num_shard: usize) -> Option<String> {
|
||||
fn spawn_webserver(
|
||||
num_shard: usize,
|
||||
args: Args,
|
||||
max_input_tokens: usize,
|
||||
max_total_tokens: usize,
|
||||
max_input_tokens: Option<usize>,
|
||||
max_total_tokens: Option<usize>,
|
||||
max_batch_prefill_tokens: u32,
|
||||
shutdown: Arc<AtomicBool>,
|
||||
shutdown_receiver: &mpsc::Receiver<()>,
|
||||
@ -1454,10 +1469,6 @@ fn spawn_webserver(
|
||||
args.max_stop_sequences.to_string(),
|
||||
"--max-top-n-tokens".to_string(),
|
||||
args.max_top_n_tokens.to_string(),
|
||||
"--max-input-tokens".to_string(),
|
||||
max_input_tokens.to_string(),
|
||||
"--max-total-tokens".to_string(),
|
||||
max_total_tokens.to_string(),
|
||||
"--max-batch-prefill-tokens".to_string(),
|
||||
max_batch_prefill_tokens.to_string(),
|
||||
"--waiting-served-ratio".to_string(),
|
||||
@ -1474,7 +1485,21 @@ fn spawn_webserver(
|
||||
format!("{}-0", args.shard_uds_path),
|
||||
"--tokenizer-name".to_string(),
|
||||
args.model_id,
|
||||
"--payload-limit".to_string(),
|
||||
args.payload_limit.to_string(),
|
||||
];
|
||||
if let Some(max_input_tokens) = max_input_tokens {
|
||||
router_args.extend_from_slice(&[
|
||||
"--max-input-tokens".to_string(),
|
||||
max_input_tokens.to_string(),
|
||||
]);
|
||||
}
|
||||
if let Some(max_total_tokens) = max_total_tokens {
|
||||
router_args.extend_from_slice(&[
|
||||
"--max-total-tokens".to_string(),
|
||||
max_total_tokens.to_string(),
|
||||
]);
|
||||
}
|
||||
|
||||
// Pass usage stats flags to router
|
||||
router_args.push("--usage-stats".to_string());
|
||||
@ -1675,13 +1700,6 @@ fn main() -> Result<(), LauncherError> {
|
||||
let max_position_embeddings = if let Some(config) = &config {
|
||||
if let Some(max_position_embeddings) = config.max_position_embeddings {
|
||||
if max_position_embeddings > max_default {
|
||||
let max = max_position_embeddings;
|
||||
if args.max_input_tokens.is_none()
|
||||
&& args.max_total_tokens.is_none()
|
||||
&& args.max_batch_prefill_tokens.is_none()
|
||||
{
|
||||
tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
|
||||
}
|
||||
max_default
|
||||
} else {
|
||||
max_position_embeddings
|
||||
@ -1704,35 +1722,19 @@ fn main() -> Result<(), LauncherError> {
|
||||
format!("Both `max_input_tokens` ({max_input_tokens}) and `max_input_length` ({max_input_length}) are set. Please define only `max_input_tokens` as `max_input_length is deprecated for naming consistency.",
|
||||
)));
|
||||
}
|
||||
(Some(max_input_tokens), None) | (None, Some(max_input_tokens)) => max_input_tokens,
|
||||
(None, None) => {
|
||||
let value = max_position_embeddings - 1;
|
||||
tracing::info!("Default `max_input_tokens` to {value}");
|
||||
value
|
||||
}
|
||||
}
|
||||
};
|
||||
let max_total_tokens = {
|
||||
match args.max_total_tokens {
|
||||
Some(max_total_tokens) => max_total_tokens,
|
||||
None => {
|
||||
let value = max_position_embeddings;
|
||||
tracing::info!("Default `max_total_tokens` to {value}");
|
||||
value
|
||||
(Some(max_input_tokens), None) | (None, Some(max_input_tokens)) => {
|
||||
Some(max_input_tokens)
|
||||
}
|
||||
(None, None) => None,
|
||||
}
|
||||
};
|
||||
let max_total_tokens = args.max_total_tokens;
|
||||
let max_batch_prefill_tokens = {
|
||||
match args.max_batch_prefill_tokens {
|
||||
Some(max_batch_prefill_tokens) => max_batch_prefill_tokens,
|
||||
None => {
|
||||
let value: u32 = if let Some(max_batch_size) = args.max_batch_size {
|
||||
max_batch_size * max_input_tokens
|
||||
} else {
|
||||
// Adding some edge in order to account for potential block_size alignement
|
||||
// issue.
|
||||
max_input_tokens + 50
|
||||
} as u32;
|
||||
// TODO figure out hardware optimal value
|
||||
let value = 4096.min(max_position_embeddings as u32);
|
||||
tracing::info!("Default `max_batch_prefill_tokens` to {value}");
|
||||
value
|
||||
}
|
||||
@ -1740,11 +1742,13 @@ fn main() -> Result<(), LauncherError> {
|
||||
};
|
||||
|
||||
// Validate args
|
||||
if let (Some(max_input_tokens), Some(max_total_tokens)) = (max_input_tokens, max_total_tokens) {
|
||||
if max_input_tokens >= max_total_tokens {
|
||||
return Err(LauncherError::ArgumentValidation(
|
||||
"`max_input_tokens must be < `max_total_tokens`".to_string(),
|
||||
format!("`max_input_tokens`({max_input_tokens}) must be < `max_total_tokens`({max_total_tokens})"),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if matches!(args.quantize, Some(Quantization::Bitsandbytes)) {
|
||||
tracing::warn!("Bitsandbytes is deprecated, use `eetq` instead, which provides better latencies overall and is drop-in in most cases.");
|
||||
@ -1798,6 +1802,7 @@ fn main() -> Result<(), LauncherError> {
|
||||
}
|
||||
|
||||
if let Some(ref max_batch_total_tokens) = args.max_batch_total_tokens {
|
||||
if let Some(max_total_tokens) = max_total_tokens {
|
||||
if max_total_tokens as u32 > *max_batch_total_tokens {
|
||||
return Err(LauncherError::ArgumentValidation(format!(
|
||||
"`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}",
|
||||
@ -1805,6 +1810,7 @@ fn main() -> Result<(), LauncherError> {
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if args.ngrok {
|
||||
if args.ngrok_authtoken.is_none() {
|
||||
|
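With the launcher changes above, `--max-input-tokens` and `--max-total-tokens` become optional (their defaults now follow the updated `min(max_allocatable, max_position_embeddings)` doc-comment rule) and a `--payload-limit` flag is introduced. A hedged sketch of a launch that still pins both limits explicitly, assuming the usual `text-generation-launcher` entry point; the model id and values are placeholders, not taken from this diff:

    text-generation-launcher \
        --model-id meta-llama/Llama-3.1-8B-Instruct \
        --max-input-tokens 4095 \
        --max-total-tokens 4096 \
        --payload-limit 2000000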
@ -1,9 +0,0 @@
|
||||
|
||||
ShareGPT_V3_unfiltered_cleaned_split.json:
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
python filter.py
|
||||
|
||||
prepare_orca:
|
||||
python orca.py
|
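The data-preparation Makefile removed above is superseded by the `benchmarks.py` harness added below, which drives TGI and the benchmark tool as Docker containers and collects the results into a dataframe. A minimal invocation sketch, assuming the argparse flags mirror the `main(sha, results_file)` signature defined in the file; the sha and output path are placeholders:

    python load_tests/benchmarks.py \
        --sha <commit-sha> \
        --results-file results.parquet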
242
load_tests/benchmarks.py
Normal file
242
load_tests/benchmarks.py
Normal file
@ -0,0 +1,242 @@
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import traceback
|
||||
from typing import Dict, Tuple, List
|
||||
|
||||
import GPUtil
|
||||
import docker
|
||||
from docker.models.containers import Container
|
||||
from loguru import logger
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class InferenceEngineRunner:
|
||||
def __init__(self, model: str):
|
||||
self.model = model
|
||||
|
||||
def run(self, parameters: list[tuple], gpus: int = 0):
|
||||
raise NotImplementedError("This method should be implemented by the subclass")
|
||||
|
||||
def stop(self):
|
||||
raise NotImplementedError("This method should be implemented by the subclass")
|
||||
|
||||
|
||||
class TGIDockerRunner(InferenceEngineRunner):
|
||||
def __init__(self,
|
||||
model: str,
|
||||
image: str = "ghcr.io/huggingface/text-generation-inference:latest",
|
||||
volumes=None):
|
||||
super().__init__(model)
|
||||
if volumes is None:
|
||||
volumes = []
|
||||
self.container = None
|
||||
self.image = image
|
||||
self.volumes = volumes
|
||||
|
||||
def run(self, parameters: list[tuple], gpus: int = 0):
|
||||
params = f"--model-id {self.model} --port 8080"
|
||||
for p in parameters:
|
||||
params += f" --{p[0]} {str(p[1])}"
|
||||
logger.info(f"Running TGI with parameters: {params}")
|
||||
volumes = {}
|
||||
for v in self.volumes:
|
||||
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
|
||||
self.container = run_docker(self.image, params,
|
||||
"Connected",
|
||||
"ERROR",
|
||||
volumes=volumes,
|
||||
gpus=gpus,
|
||||
ports={"8080/tcp": 8080}
|
||||
)
|
||||
|
||||
def stop(self):
|
||||
if self.container:
|
||||
self.container.stop()
|
||||
|
||||
|
||||
class BenchmarkRunner:
|
||||
def __init__(self,
|
||||
image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
|
||||
volumes: List[Tuple[str, str]] = None):
|
||||
if volumes is None:
|
||||
volumes = []
|
||||
self.container = None
|
||||
self.image = image
|
||||
self.volumes = volumes
|
||||
|
||||
def run(self, parameters: list[tuple], network_mode):
|
||||
params = "text-generation-inference-benchmark"
|
||||
for p in parameters:
|
||||
params += f" --{p[0]} {str(p[1])}" if p[1] is not None else f" --{p[0]}"
|
||||
logger.info(f"Running text-generation-inference-benchmarks with parameters: {params}")
|
||||
volumes = {}
|
||||
for v in self.volumes:
|
||||
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
|
||||
self.container = run_docker(self.image, params,
|
||||
"Benchmark finished",
|
||||
"Fatal:",
|
||||
volumes=volumes,
|
||||
extra_env={"RUST_LOG": "text_generation_inference_benchmark=info",
|
||||
"RUST_BACKTRACE": "full"},
|
||||
network_mode=network_mode)
|
||||
|
||||
def stop(self):
|
||||
if self.container:
|
||||
self.container.stop()
|
||||
|
||||
|
||||
def run_docker(image: str, args: str, success_sentinel: str,
|
||||
error_sentinel: str, ports: Dict[str, int] = None, volumes=None, network_mode: str = "bridge",
|
||||
gpus: int = 0, extra_env: Dict[str, str] = None) -> Container:
|
||||
if ports is None:
|
||||
ports = {}
|
||||
if volumes is None:
|
||||
volumes = {}
|
||||
if extra_env is None:
|
||||
extra_env = {}
|
||||
client = docker.from_env(timeout=300)
|
||||
# retrieve the GPU devices from CUDA_VISIBLE_DEVICES
|
||||
devices = [f"{i}" for i in
|
||||
range(get_num_gpus())][:gpus]
|
||||
environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
|
||||
environment.update(extra_env)
|
||||
container = client.containers.run(image, args,
|
||||
detach=True,
|
||||
device_requests=[
|
||||
docker.types.DeviceRequest(device_ids=devices,
|
||||
capabilities=[['gpu']])
|
||||
] if gpus > 0 else None,
|
||||
volumes=volumes,
|
||||
shm_size="1g",
|
||||
ports=ports,
|
||||
network_mode=network_mode,
|
||||
environment=environment, )
|
||||
for line in container.logs(stream=True):
|
||||
print(line.decode("utf-8"), end="")
|
||||
if success_sentinel.encode("utf-8") in line:
|
||||
break
|
||||
if error_sentinel.encode("utf-8") in line:
|
||||
container.stop()
|
||||
raise Exception(f"Error starting container: {line}")
|
||||
return container
|
||||
|
||||
|
||||
def get_gpu_names() -> str:
|
||||
gpus = GPUtil.getGPUs()
|
||||
if len(gpus) == 0:
|
||||
return ''
|
||||
return f'{len(gpus)}x{gpus[0].name if gpus else "No GPU available"}'
|
||||
|
||||
|
||||
def get_gpu_name() -> str:
|
||||
gpus = GPUtil.getGPUs()
|
||||
if len(gpus) == 0:
|
||||
return ''
|
||||
return gpus[0].name
|
||||
|
||||
|
||||
def get_num_gpus() -> int:
|
||||
return len(GPUtil.getGPUs())
|
||||
|
||||
|
||||
def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
|
||||
df = pd.DataFrame()
|
||||
now = datetime.datetime.now(datetime.timezone.utc)
|
||||
created_at = now.isoformat() # '2024-10-02T11:53:17.026215+00:00'
|
||||
# Load the results
|
||||
for key, filename in data_files.items():
|
||||
with open(filename, 'r') as f:
|
||||
data = json.load(f)
|
||||
for result in data['results']:
|
||||
entry = result
|
||||
[config] = pd.json_normalize(result['config']).to_dict(orient='records')
|
||||
entry.update(config)
|
||||
entry['engine'] = data['config']['meta']['engine']
|
||||
entry['tp'] = data['config']['meta']['tp']
|
||||
entry['version'] = data['config']['meta']['version']
|
||||
entry['model'] = model
|
||||
entry['created_at'] = created_at
|
||||
del entry['config']
|
||||
df = pd.concat([df, pd.DataFrame(entry, index=[0])])
|
||||
return df
|
||||
|
||||
|
||||
def main(sha, results_file):
|
||||
results_dir = 'results'
|
||||
# get absolute path
|
||||
results_dir = os.path.join(os.path.dirname(__file__), results_dir)
|
||||
logger.info('Starting benchmark')
|
||||
models = [
|
||||
('meta-llama/Llama-3.1-8B-Instruct', 1),
|
||||
# ('meta-llama/Llama-3.1-70B-Instruct', 4),
|
||||
# ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
|
||||
]
|
||||
success = True
|
||||
for model in models:
|
||||
tgi_runner = TGIDockerRunner(model[0])
|
||||
# create results directory
|
||||
model_dir = os.path.join(results_dir, f'{model[0].replace("/", "_").replace(".", "_")}')
|
||||
os.makedirs(model_dir, exist_ok=True)
|
||||
runner = BenchmarkRunner(
|
||||
volumes=[(model_dir, '/opt/text-generation-inference-benchmark/results')]
|
||||
)
|
||||
try:
|
||||
tgi_runner.run([('max-concurrent-requests', 512)], gpus=model[1])
|
||||
logger.info(f'TGI started for model {model[0]}')
|
||||
parameters = [
|
||||
('tokenizer-name', model[0]),
|
||||
('max-vus', 800),
|
||||
('url', 'http://localhost:8080'),
|
||||
('duration', '120s'),
|
||||
('warmup', '30s'),
|
||||
('benchmark-kind', 'rate'),
|
||||
('prompt-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
|
||||
('decode-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
|
||||
('extra-meta', f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"'),
|
||||
('no-console', None)
|
||||
]
|
||||
rates = [('rates', f'{r / 10.}') for r in list(range(8, 248, 8))]
|
||||
parameters.extend(rates)
|
||||
runner.run(parameters, f'container:{tgi_runner.container.id}')
|
||||
except Exception as e:
|
||||
logger.error(f'Error running benchmark for model {model[0]}: {e}')
|
||||
# print the stack trace
|
||||
print(traceback.format_exc())
|
||||
success = False
|
||||
finally:
|
||||
tgi_runner.stop()
|
||||
runner.stop()
|
||||
if not success:
|
||||
logger.error('Some benchmarks failed')
|
||||
exit(1)
|
||||
|
||||
df = pd.DataFrame()
|
||||
# list recursively directories
|
||||
directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')]
|
||||
logger.info(f'Found result directories: {directories}')
|
||||
for directory in directories:
|
||||
data_files = {}
|
||||
for filename in os.listdir(directory):
|
||||
if filename.endswith('.json'):
|
||||
data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
|
||||
logger.info(f'Processing directory {directory}')
|
||||
df = pd.concat([df, build_df(directory.split('/')[-1], data_files)])
|
||||
df['device'] = get_gpu_name()
|
||||
df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
|
||||
df.to_parquet(results_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--sha", help="SHA of the commit to add to the results", required=True)
|
||||
parser.add_argument("--results-file",
|
||||
help="The file where to store the results, can be a local file or a s3 path")
|
||||
args = parser.parse_args()
|
||||
if args.results_file is None:
|
||||
results_file = f'{args.sha}.parquet'
|
||||
else:
|
||||
results_file = args.results_file
|
||||
|
||||
main(args.sha, results_file)
|
@@ -1,94 +0,0 @@
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;

// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))


export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            // single_user: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 1,
            //     rate: 20,
            //     timeUnit: '1s',
            // },
            // load_test: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 100,
            //     rate: 1,
            //     timeUnit: '1s',
            // },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
            //     preAllocatedVUs: 300,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            throughput: {
                executor: 'shared-iterations',
                vus: 100,
                iterations: 200,
                maxDuration: '40s',
            },
        },
    };
}

function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();

export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    if (res.status >= 400 && res.status < 500) {
        return;
    }


    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}
@@ -1,26 +0,0 @@
import json


def main():
    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
        data = json.load(f)

    # Select only the first 2k conversations that start with a human.
    max = 2000
    conversations = []
    for conversation in data:
        conv = conversation.get("conversations")
        if conv and conv[0]["from"] == "human":
            # Trim the rest of the output
            conversation["conversations"] = conversation["conversations"][:1]
            conversations.append(conversation)

        if len(conversation) >= max:
            break

    with open("./small.json", "w") as f:
        data = json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()
@@ -1,27 +0,0 @@
import json
import datasets
import tqdm


def main():
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
    # Select only the first 2k conversations that start with a human.
    max = min(2000, len(dataset))
    conversations = []
    for item in tqdm.tqdm(dataset, total=max):
        conversation = {
            "conversations": [
                {"from": "human", "value": item["question"]},
            ],
            "id": item["id"],
        }
        conversations.append(conversation)
        if len(conversations) >= max:
            break

    with open("./small.json", "w") as f:
        json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()
540  load_tests/poetry.lock  generated  Normal file
@@ -0,0 +1,540 @@
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2024.8.30"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"},
|
||||
{file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.3.2"
|
||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
{file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
|
||||
{file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
|
||||
{file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
|
||||
{file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
|
||||
{file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
|
||||
{file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
|
||||
{file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
|
||||
{file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
description = "Cross-platform colored terminal text."
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
files = [
|
||||
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docker"
|
||||
version = "7.1.0"
|
||||
description = "A Python library for the Docker Engine API."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"},
|
||||
{file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""}
|
||||
requests = ">=2.26.0"
|
||||
urllib3 = ">=1.26.0"
|
||||
|
||||
[package.extras]
|
||||
dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"]
|
||||
docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"]
|
||||
ssh = ["paramiko (>=2.4.3)"]
|
||||
websockets = ["websocket-client (>=1.3.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "gputil"
|
||||
version = "1.4.0"
|
||||
description = "GPUtil is a Python module for getting the GPU status from NVIDA GPUs using nvidia-smi."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "GPUtil-1.4.0.tar.gz", hash = "sha256:099e52c65e512cdfa8c8763fca67f5a5c2afb63469602d5dcb4d296b3661efb9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.10"
|
||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"},
|
||||
{file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "loguru"
|
||||
version = "0.7.2"
|
||||
description = "Python logging made (stupidly) simple"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
{file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"},
|
||||
{file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
|
||||
win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}
|
||||
|
||||
[package.extras]
|
||||
dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "mypy (==v1.5.1)", "pre-commit (==3.4.0)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "numpy"
|
||||
version = "2.1.1"
|
||||
description = "Fundamental package for array computing in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
files = [
|
||||
{file = "numpy-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8a0e34993b510fc19b9a2ce7f31cb8e94ecf6e924a40c0c9dd4f62d0aac47d9"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7dd86dfaf7c900c0bbdcb8b16e2f6ddf1eb1fe39c6c8cca6e94844ed3152a8fd"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:5889dd24f03ca5a5b1e8a90a33b5a0846d8977565e4ae003a63d22ecddf6782f"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:59ca673ad11d4b84ceb385290ed0ebe60266e356641428c845b39cd9df6713ab"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:13ce49a34c44b6de5241f0b38b07e44c1b2dcacd9e36c30f9c2fcb1bb5135db7"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:913cc1d311060b1d409e609947fa1b9753701dac96e6581b58afc36b7ee35af6"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:caf5d284ddea7462c32b8d4a6b8af030b6c9fd5332afb70e7414d7fdded4bfd0"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:57eb525e7c2a8fdee02d731f647146ff54ea8c973364f3b850069ffb42799647"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-win32.whl", hash = "sha256:9a8e06c7a980869ea67bbf551283bbed2856915f0a792dc32dd0f9dd2fb56728"},
|
||||
{file = "numpy-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:d10c39947a2d351d6d466b4ae83dad4c37cd6c3cdd6d5d0fa797da56f710a6ae"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d07841fd284718feffe7dd17a63a2e6c78679b2d386d3e82f44f0108c905550"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b5613cfeb1adfe791e8e681128f5f49f22f3fcaa942255a6124d58ca59d9528f"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0b8cc2715a84b7c3b161f9ebbd942740aaed913584cae9cdc7f8ad5ad41943d0"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:b49742cdb85f1f81e4dc1b39dcf328244f4d8d1ded95dea725b316bd2cf18c95"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8d5f8a8e3bc87334f025194c6193e408903d21ebaeb10952264943a985066ca"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d51fc141ddbe3f919e91a096ec739f49d686df8af254b2053ba21a910ae518bf"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:98ce7fb5b8063cfdd86596b9c762bf2b5e35a2cdd7e967494ab78a1fa7f8b86e"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:24c2ad697bd8593887b019817ddd9974a7f429c14a5469d7fad413f28340a6d2"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-win32.whl", hash = "sha256:397bc5ce62d3fb73f304bec332171535c187e0643e176a6e9421a6e3eacef06d"},
|
||||
{file = "numpy-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:ae8ce252404cdd4de56dcfce8b11eac3c594a9c16c231d081fb705cf23bd4d9e"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c803b7934a7f59563db459292e6aa078bb38b7ab1446ca38dd138646a38203e"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6435c48250c12f001920f0751fe50c0348f5f240852cfddc5e2f97e007544cbe"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:3269c9eb8745e8d975980b3a7411a98976824e1fdef11f0aacf76147f662b15f"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:fac6e277a41163d27dfab5f4ec1f7a83fac94e170665a4a50191b545721c6521"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fcd8f556cdc8cfe35e70efb92463082b7f43dd7e547eb071ffc36abc0ca4699b"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b9cd92c8f8e7b313b80e93cedc12c0112088541dcedd9197b5dee3738c1201"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:afd9c680df4de71cd58582b51e88a61feed4abcc7530bcd3d48483f20fc76f2a"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8661c94e3aad18e1ea17a11f60f843a4933ccaf1a25a7c6a9182af70610b2313"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-win32.whl", hash = "sha256:950802d17a33c07cba7fd7c3dcfa7d64705509206be1606f196d179e539111ed"},
|
||||
{file = "numpy-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:3fc5eabfc720db95d68e6646e88f8b399bfedd235994016351b1d9e062c4b270"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:046356b19d7ad1890c751b99acad5e82dc4a02232013bd9a9a712fddf8eb60f5"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6e5a9cb2be39350ae6c8f79410744e80154df658d5bea06e06e0ac5bb75480d5"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:d4c57b68c8ef5e1ebf47238e99bf27657511ec3f071c465f6b1bccbef12d4136"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:8ae0fd135e0b157365ac7cc31fff27f07a5572bdfc38f9c2d43b2aff416cc8b0"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:981707f6b31b59c0c24bcda52e5605f9701cb46da4b86c2e8023656ad3e833cb"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ca4b53e1e0b279142113b8c5eb7d7a877e967c306edc34f3b58e9be12fda8df"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e097507396c0be4e547ff15b13dc3866f45f3680f789c1a1301b07dadd3fbc78"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7506387e191fe8cdb267f912469a3cccc538ab108471291636a96a54e599556"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-win32.whl", hash = "sha256:251105b7c42abe40e3a689881e1793370cc9724ad50d64b30b358bbb3a97553b"},
|
||||
{file = "numpy-2.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:f212d4f46b67ff604d11fff7cc62d36b3e8714edf68e44e9760e19be38c03eb0"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:920b0911bb2e4414c50e55bd658baeb78281a47feeb064ab40c2b66ecba85553"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:bab7c09454460a487e631ffc0c42057e3d8f2a9ddccd1e60c7bb8ed774992480"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:cea427d1350f3fd0d2818ce7350095c1a2ee33e30961d2f0fef48576ddbbe90f"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:e30356d530528a42eeba51420ae8bf6c6c09559051887196599d96ee5f536468"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8dfa9e94fc127c40979c3eacbae1e61fda4fe71d84869cc129e2721973231ef"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910b47a6d0635ec1bd53b88f86120a52bf56dcc27b51f18c7b4a2e2224c29f0f"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:13cc11c00000848702322af4de0147ced365c81d66053a67c2e962a485b3717c"},
|
||||
{file = "numpy-2.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:53e27293b3a2b661c03f79aa51c3987492bd4641ef933e366e0f9f6c9bf257ec"},
|
||||
{file = "numpy-2.1.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7be6a07520b88214ea85d8ac8b7d6d8a1839b0b5cb87412ac9f49fa934eb15d5"},
|
||||
{file = "numpy-2.1.1-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:52ac2e48f5ad847cd43c4755520a2317f3380213493b9d8a4c5e37f3b87df504"},
|
||||
{file = "numpy-2.1.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50a95ca3560a6058d6ea91d4629a83a897ee27c00630aed9d933dff191f170cd"},
|
||||
{file = "numpy-2.1.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:99f4a9ee60eed1385a86e82288971a51e71df052ed0b2900ed30bc840c0f2e39"},
|
||||
{file = "numpy-2.1.1.tar.gz", hash = "sha256:d0cf7d55b1051387807405b3898efafa862997b4cba8aa5dbe657be794afeafd"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pandas"
|
||||
version = "2.2.3"
|
||||
description = "Powerful data structures for data analysis, time series, and statistics"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"},
|
||||
{file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
tzdata = ">=2022.7"
|
||||
|
||||
[package.extras]
|
||||
all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"]
|
||||
aws = ["s3fs (>=2022.11.0)"]
|
||||
clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"]
|
||||
compression = ["zstandard (>=0.19.0)"]
|
||||
computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"]
|
||||
consortium-standard = ["dataframe-api-compat (>=0.1.7)"]
|
||||
excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"]
|
||||
feather = ["pyarrow (>=10.0.1)"]
|
||||
fss = ["fsspec (>=2022.11.0)"]
|
||||
gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"]
|
||||
hdf5 = ["tables (>=3.8.0)"]
|
||||
html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"]
|
||||
mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"]
|
||||
output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"]
|
||||
parquet = ["pyarrow (>=10.0.1)"]
|
||||
performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"]
|
||||
plot = ["matplotlib (>=3.6.3)"]
|
||||
postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"]
|
||||
pyarrow = ["pyarrow (>=10.0.1)"]
|
||||
spss = ["pyreadstat (>=1.2.0)"]
|
||||
sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"]
|
||||
test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"]
|
||||
xml = ["lxml (>=4.9.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "psutil"
|
||||
version = "6.0.0"
|
||||
description = "Cross-platform lib for process and system monitoring in Python."
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
|
||||
files = [
|
||||
{file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"},
|
||||
{file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"},
|
||||
{file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"},
|
||||
{file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"},
|
||||
{file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"},
|
||||
{file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"},
|
||||
{file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"},
|
||||
{file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"},
|
||||
{file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"},
|
||||
{file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"},
|
||||
{file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"},
|
||||
{file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = "sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"},
|
||||
{file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"},
|
||||
{file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"},
|
||||
{file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"},
|
||||
{file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"},
|
||||
{file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
|
||||
|
||||
[[package]]
|
||||
name = "pyarrow"
|
||||
version = "17.0.0"
|
||||
description = "Python library for Apache Arrow"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
|
||||
{file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
|
||||
{file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
|
||||
{file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"},
|
||||
{file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"},
|
||||
{file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"},
|
||||
{file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
numpy = ">=1.16.6"
|
||||
|
||||
[package.extras]
|
||||
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
|
||||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.9.0.post0"
|
||||
description = "Extensions to the standard Python datetime module"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||
files = [
|
||||
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
|
||||
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
six = ">=1.5"
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2024.2"
|
||||
description = "World timezone definitions, modern and historical"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"},
|
||||
{file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "306"
|
||||
description = "Python for Window Extensions"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"},
|
||||
{file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"},
|
||||
{file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"},
|
||||
{file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"},
|
||||
{file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"},
|
||||
{file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"},
|
||||
{file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"},
|
||||
{file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"},
|
||||
{file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"},
|
||||
{file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"},
|
||||
{file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"},
|
||||
{file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"},
|
||||
{file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"},
|
||||
{file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.3"
|
||||
description = "Python HTTP for Humans."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
|
||||
{file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = ">=2017.4.17"
|
||||
charset-normalizer = ">=2,<4"
|
||||
idna = ">=2.5,<4"
|
||||
urllib3 = ">=1.21.1,<3"
|
||||
|
||||
[package.extras]
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
|
||||
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
version = "1.16.0"
|
||||
description = "Python 2 and 3 compatibility utilities"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
files = [
|
||||
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2024.2"
|
||||
description = "Provider of IANA time zone data"
|
||||
optional = false
|
||||
python-versions = ">=2"
|
||||
files = [
|
||||
{file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"},
|
||||
{file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.2.3"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"},
|
||||
{file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
|
||||
h2 = ["h2 (>=4,<5)"]
|
||||
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
zstd = ["zstandard (>=0.18.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "win32-setctime"
|
||||
version = "1.1.0"
|
||||
description = "A small Python utility to set file creation time on Windows"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
{file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"},
|
||||
{file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "3e5e8d72bae5534f1b40e50a87d0549c65003cef0f52a7487aea7366b7b849e9"
|
19
load_tests/pyproject.toml
Normal file
@ -0,0 +1,19 @@
[tool.poetry]
name = "text-generation-inference-benchmarks"
version = "0.1.0"
description = ""
authors = ["Hugo Larcher <hugo.larcher@huggingface.co>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
docker = "^7.1.0"
loguru = "^0.7.2"
psutil = "^6.0.0"
gputil = "^1.4.0"
pandas = "^2.2.3"
pyarrow = "^17.0.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
@ -9,6 +9,7 @@
cudaPackages,
openssl,
pkg-config,
poetry,
protobuf,
python3,
pyright,
@ -28,6 +29,7 @@ mkShell {
black
isort
pkg-config
poetry
(rust-bin.stable.latest.default.override {
extensions = [
"rust-analyzer"
@ -3,12 +3,13 @@
buildPythonPackage,
poetry-core,
mypy-protobuf,
attention-kernels,
awq-inference-engine,
causal-conv1d,
compressed-tensors,
eetq,
einops,
exllamav2,
fbgemm-gpu,
flashinfer,
flash-attn,
flash-attn-layer-norm,
@ -27,15 +28,19 @@
opentelemetry-exporter-otlp,
opentelemetry-instrumentation-grpc,
opentelemetry-semantic-conventions,
outlines,
peft,
pillow,
prometheus-client,
punica-kernels,
py-cpuinfo,
pydantic,
safetensors,
tokenizers,
torch,
sentencepiece,
transformers,
typer,
vllm,
}:

let
@ -65,6 +70,7 @@ buildPythonPackage {
"huggingface-hub"
"loguru"
"opentelemetry-instrumentation-grpc"
"pillow"
"sentencepiece"
"typer"
];
@ -72,12 +78,13 @@ buildPythonPackage {
pythonRemoveDeps = [ "scipy" ];

dependencies = [
attention-kernels
awq-inference-engine
eetq
causal-conv1d
compressed-tensors
einops
exllamav2
fbgemm-gpu
flashinfer
flash-attn
flash-attn-layer-norm
@ -95,14 +102,18 @@ buildPythonPackage {
opentelemetry-exporter-otlp
opentelemetry-instrumentation-grpc
opentelemetry-semantic-conventions
outlines
peft
pillow
prometheus-client
punica-kernels
py-cpuinfo
pydantic
safetensors
sentencepiece
tokenizers
transformers
typer
vllm
];

prePatch = ''
@ -272,12 +272,18 @@ message DecodeResponse {
message WarmupRequest {
/// Batch to warmup on
Batch batch = 1;
uint32 max_input_length = 2;
optional uint32 max_input_tokens = 2;
uint32 max_prefill_tokens = 3;
uint32 max_total_tokens = 4;
optional uint32 max_total_tokens = 4;
}

message WarmupResponse {
/// Maximum number of tokens supported by the model
optional uint32 max_supported_total_tokens = 1;
/// Maximum input tokens by clients should be equal to request value if it's set
/// Otherwise warmup automatically allocates a value here
uint32 max_input_tokens = 2;
/// Maximum total tokens by clients should be equal to request value if it's set
/// Otherwise warmup automatically allocates a value here
uint32 max_total_tokens = 3;
}
@ -138,13 +138,43 @@ impl Paligemma {
}
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2VlVisionConfig {
pub(crate) depth: usize,
pub(crate) embed_dim: usize,
pub(crate) mlp_ratio: usize,
pub(crate) num_heads: usize,
pub(crate) in_chans: usize,
pub(crate) hidden_size: usize,
pub(crate) patch_size: usize,
pub(crate) spatial_merge_size: usize,
pub(crate) spatial_patch_size: usize,
pub(crate) temporal_patch_size: usize,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2Vl {
pub(crate) vision_config: Qwen2VlVisionConfig,
}

impl Qwen2Vl {
pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
let num_pixels = height * width;
num_pixels / self.vision_config.patch_size.pow(2)
}
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "model_type")]
#[serde(rename_all = "snake_case")]
pub enum Config {
Qwen2Vl(Qwen2Vl),
LlavaNext(LlavaNext),
ClipVisionModel(ClipVisionModel),
Mistral,
Mamba,
Idefics,
Mllama,
Idefics2(Idefics2),
@ -2,7 +2,6 @@ use crate::infer::InferError;
use crate::{ChatTemplateInputs, Message, MessageChunk, TextMessage, TokenizerConfigToken, Tool};
use minijinja::{Environment, ErrorKind, Template};
use minijinja_contrib::pycompat;
use std::collections::HashSet;

/// Raise a exception (custom function) used in the chat templates
pub(crate) fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
@ -15,7 +14,6 @@ pub(crate) struct ChatTemplate {
bos_token: Option<String>,
eos_token: Option<String>,
use_default_tool_template: bool,
variables: HashSet<String>,
}

impl ChatTemplate {
@ -47,21 +45,14 @@ impl ChatTemplate {
bos_token: bos_token.map(|token| token.as_str().to_string()),
eos_token: eos_token.map(|token| token.as_str().to_string()),
use_default_tool_template,
variables,
}
}

pub(crate) fn apply(
&self,
guideline: Option<&str>,
mut messages: Vec<Message>,
tools_and_prompt: Option<(Vec<Tool>, String)>,
) -> Result<String, InferError> {
// check if guideline is expected but not provided
if self.variables.contains("guideline") && guideline.is_none() {
return Err(InferError::MissingTemplateVariable("guideline".to_string()));
}

let tools = match tools_and_prompt {
Some((tools, tool_prompt)) => {
// check if the `tools` variable is used in the template
@ -87,7 +78,6 @@ impl ChatTemplate {

self.template
.render(ChatTemplateInputs {
guideline,
messages,
bos_token: self.bos_token.as_deref(),
eos_token: self.eos_token.as_deref(),
@ -756,19 +746,6 @@ mod tests {
},
target: "You are a friendly chatbot who always responds in the style of a pirateYou are a friendly chatbot who always responds in the style of a pirate### Instruction: Hello, how are you?### Response: I'm doing great. How can I help you today?### Instruction: I'd like to show off how chat templating works!",
},
ChatTemplateTestItem {
name: "google/shieldgemma-9b",
chat_template: "{{- bos_token }}\n{%- if messages[-1].role == 'user' %}\n {{- \"You are a policy expert trying to help determine whether a user prompt is in violation of the defined safety policies.\\n\\n\" }}\n {{- \"<start_of_turn>\\n\" }}\n {{- \"Human Question: \" + messages[-1].content }}\n {{- \"\\n<end_of_turn>\\n\\n\" }}\n {{- \"Our safety principle is defined in the below:\\n\\n\" }}\n {{- \"* \" + guideline + \"\\n\" }}\n {{- \"\\n===\\n\\n\" }}\n {{- \"Does the human question violate the above principle? Your answer must start with 'Yes' or 'No'. And then walk through step by step to be sure we answer correctly.\\n\\n\" }}\n{%- else %}\n {{- \"You are a policy expert trying to help determine whether an AI response to a prompt is in violation of the defined safety policies.\\n\\n\" }}\n {{- \"<start_of_turn>\\n\" }}\n {{- \"Human Question: \" + messages[-2].content }}\n {{- \"\\n<end_of_turn>\\n\" }}\n {{- \"<start_of_turn>\\n\" }}\n {{- \"Chatbot Response: \" + messages[-1].content }}\n {{- \"\\n<end_of_turn>\\n\\n\" }}\n {{- \"Our safety principle is defined in the below:\\n\\n\" }}\n {{- \"* \" + guideline + \"\\n\" }}\n {{- \"\\n===\\n\\n\" }}\n {{- \"Does the Chatbot Response violate the above principle? Your answer must start with 'Yes' or 'No'. And then walk through step by step to be sure we answer correctly.\\n\\n\" }}\n{%- endif %}\n\n",
input: ChatTemplateInputs {
messages: example_chat_with_system.clone(),
add_generation_prompt: false,
bos_token: Some("<s>"),
eos_token: Some("</s>"),
guideline: Some("Do not use offensive language."),
..Default::default()
},
target: "<s>You are a policy expert trying to help determine whether a user prompt is in violation of the defined safety policies.\n\n<start_of_turn>\nHuman Question: I'd like to show off how chat templating works!\n<end_of_turn>\n\nOur safety principle is defined in the below:\n\n* Do not use offensive language.\n\n===\n\nDoes the human question violate the above principle? Your answer must start with 'Yes' or 'No'. And then walk through step by step to be sure we answer correctly.\n\n",
},
];

#[allow(unused_variables)] // name is unused
@ -794,47 +771,6 @@ mod tests {
}
}

#[test]
fn test_chat_template_invalid_with_guideline() {
let ct = ChatTemplate::new(
"{{- bos_token }}\n{%- if messages[-1].role == 'user' %}\n {{- \"You are a policy expert trying to help determine whether a user prompt is in violation of the defined safety policies.\\n\\n\" }}\n {{- \"<start_of_turn>\\n\" }}\n {{- \"Human Question: \" + messages[-1].content }}\n {{- \"\\n<end_of_turn>\\n\\n\" }}\n {{- \"Our safety principle is defined in the below:\\n\\n\" }}\n {{- \"* \" + guideline + \"\\n\" }}\n {{- \"\\n===\\n\\n\" }}\n {{- \"Does the human question violate the above principle? Your answer must start with 'Yes' or 'No'. And then walk through step by step to be sure we answer correctly.\\n\\n\" }}\n{%- else %}\n {{- \"You are a policy expert trying to help determine whether an AI response to a prompt is in violation of the defined safety policies.\\n\\n\" }}\n {{- \"<start_of_turn>\\n\" }}\n {{- \"Human Question: \" + messages[-2].content }}\n {{- \"\\n<end_of_turn>\\n\" }}\n {{- \"<start_of_turn>\\n\" }}\n {{- \"Chatbot Response: \" + messages[-1].content }}\n {{- \"\\n<end_of_turn>\\n\\n\" }}\n {{- \"Our safety principle is defined in the below:\\n\\n\" }}\n {{- \"* \" + guideline + \"\\n\" }}\n {{- \"\\n===\\n\\n\" }}\n {{- \"Does the Chatbot Response violate the above principle? Your answer must start with 'Yes' or 'No'. And then walk through step by step to be sure we answer correctly.\\n\\n\" }}\n{%- endif %}\n\n".to_string(),
Some(TokenizerConfigToken::String("<s>".to_string())),
Some(TokenizerConfigToken::String("</s>".to_string())),
);

// convert TextMessage to Message
let msgs: Vec<Message> = vec![
Message {
name: None,
role: "user".to_string(),
content: MessageContent::SingleText(
"I'd like to show off how chat templating works!".to_string(),
),
},
Message {
name: None,
role: "assistant".to_string(),
content: MessageContent::SingleText(
"I'm doing great. How can I help you today?".to_string(),
),
},
Message {
name: None,
role: "user".to_string(),
content: MessageContent::SingleText("Hello, how are you?".to_string()),
},
];

let result = ct.apply(None, msgs, None);

match result {
Ok(_) => panic!("Should have failed since no guideline is provided"),
Err(e) => {
assert_eq!(e.to_string(), "Missing template vatiable: guideline")
}
}
}

#[test]
fn test_chat_template_with_default_tool_template() {
let ct = ChatTemplate::new(
@ -10,10 +10,12 @@ use crate::{
};
use async_stream::stream;
use async_trait::async_trait;
use axum::response::sse::Event;
use chat_template::ChatTemplate;
use futures::future::try_join_all;
use futures::Stream;
use minijinja::ErrorKind;
use serde::Serialize;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use thiserror::Error;
@ -135,7 +137,7 @@ impl Infer {
pub(crate) async fn tokenize(
&self,
request: GenerateRequest,
) -> Result<Option<tokenizers::Encoding>, InferError> {
) -> Result<tokenizers::Encoding, InferError> {
// Tokenize request
let inputs = request.inputs;
let add_special_tokens = request.add_special_tokens;
@ -150,21 +152,20 @@ impl Infer {
})?;

// Return Encoding
Ok(encoding.map(|(encoding, _)| encoding))
Ok(encoding.0)
}

/// Apply the chat template to the chat request
#[instrument(skip_all)]
pub(crate) fn apply_chat_template(
&self,
guideline: Option<String>,
messages: Vec<Message>,
tools_and_prompt: Option<(Vec<Tool>, String)>,
) -> Result<String, InferError> {
self.chat_template
.as_ref()
.ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
.apply(guideline.as_deref(), messages, tools_and_prompt)
.apply(messages, tools_and_prompt)
.map_err(|e| {
metrics::counter!("tgi_request_failure", "err" => "template").increment(1);
tracing::error!("{e}");
@ -373,4 +374,26 @@ impl InferError {
InferError::StreamSerializationError(_) => "stream_serialization_error",
}
}

pub(crate) fn into_openai_event(self) -> Event {
Event::default()
.json_data(OpenaiErrorEvent {
error: APIError {
message: self.to_string(),
http_status_code: 422,
},
})
.unwrap()
}
}

#[derive(Serialize)]
pub struct APIError {
message: String,
http_status_code: usize,
}

#[derive(Serialize)]
pub struct OpenaiErrorEvent {
error: APIError,
}
@ -1,7 +1,6 @@
use crate::infer::InferError;
use crate::{
FunctionDefinition, FunctionRef, FunctionsMap, JsonSchemaTool, Properties, Tool, ToolChoice,
ToolType,
};
use serde_json::{json, Map, Value};
use std::collections::HashMap;
@ -21,22 +20,24 @@ impl ToolGrammar {
pub fn apply(
tools: Vec<Tool>,
tool_choice: ToolChoice,
) -> Result<(Vec<Tool>, Option<JsonSchemaTool>), InferError> {
// if no tools are provided, we return None
if tools.is_empty() {
return Ok((tools, None));
) -> Result<Option<(Vec<Tool>, JsonSchemaTool)>, InferError> {
let tools_to_use = match tool_choice {
ToolChoice::Function(function) => {
vec![Self::find_tool_by_name(&tools, &function.name)?]
}

let tool_choice = tool_choice.0.unwrap_or(ToolType::OneOf);

let mut tools = tools.clone();

// add the no_tool function to the tools
let no_tool = Tool {
ToolChoice::Required => tools,
ToolChoice::Auto => {
// only add the no_tool function if the user has selected the auto option
tools
.iter()
.cloned()
.chain(std::iter::once(Tool {
r#type: "function".to_string(),
function: FunctionDefinition {
name: "no_tool".to_string(),
description: Some("Open ened response with no specific tool selected".to_string()),
description: Some(
"Open ended response with no specific tool selected".to_string(),
),
arguments: json!({
"type": "object",
"properties": {
@ -48,18 +49,17 @@ impl ToolGrammar {
"required": ["content"]
}),
},
};
tools.push(no_tool);

// if tools are provided and no tool_choice we default to the OneOf
let tools_to_use = match tool_choice {
ToolType::Function(function) => {
vec![Self::find_tool_by_name(&tools, &function.name)?]
}))
.collect::<Vec<_>>()
}
ToolType::OneOf => tools.clone(),
ToolType::NoTool => return Ok((tools, None)),
ToolChoice::NoTool => vec![],
};

// if no tools are provided or if the user has selected the no_tool option, return None
if tools_to_use.is_empty() {
return Ok(None);
}

let functions: HashMap<String, serde_json::Value> = tools_to_use
.iter()
.map(|tool| {
@ -118,6 +118,6 @@ impl ToolGrammar {
},
};

Ok((tools, Some(tool_schema)))
Ok(Some((tools_to_use, tool_schema)))
}
}
@ -12,13 +12,101 @@ mod sagemaker;
pub mod usage_stats;
mod vertex;

use crate::infer::tool_grammar::ToolGrammar;
use crate::infer::{Infer, InferError};
use crate::server::prepare_chat_input;
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;
use serde::{Deserialize, Serialize};
use tokenizers::Encoding;
use tracing::warn;
use utoipa::ToSchema;
use validation::Validation;

#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum Tokenizer {
Python {
tokenizer_name: String,
revision: Option<String>,
trust_remote_code: bool,
},
Rust(tokenizers::Tokenizer),
}

pub struct PyTokenizer<'a>(pyo3::Bound<'a, pyo3::PyAny>);

impl<'a> PyTokenizer<'a> {
fn from_py(
py: Python<'a>,
tokenizer_name: String,
revision: Option<String>,
trust_remote_code: bool,
) -> PyResult<PyTokenizer<'a>> {
let transformers = py.import_bound("transformers")?;
let auto = transformers.getattr("AutoTokenizer")?;
let from_pretrained = auto.getattr("from_pretrained")?;
let args = (tokenizer_name,);
let kwargs = if let Some(rev) = &revision {
[
("revision", rev.to_string().into_py(py)),
("trust_remote_code", trust_remote_code.into_py(py)),
]
.into_py_dict_bound(py)
} else {
[("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
};
let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
tracing::info!("Loaded a python tokenizer");
Ok(PyTokenizer(tokenizer))
}
}

trait TokenizerTrait {
fn encode_trait(
&self,
query: String,
add_special_tokens: bool,
) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>>;
}

impl TokenizerTrait for tokenizers::Tokenizer {
fn encode_trait(
&self,
query: String,
add_special_tokens: bool,
) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>> {
self.encode(query, add_special_tokens)
}
}

impl<'a> TokenizerTrait for PyTokenizer<'a> {
fn encode_trait(
&self,
query: String,
add_special_tokens: bool,
) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>> {
let py = self.0.py();
let kwargs = [
("text", query.into_py(py)),
("add_special_tokens", add_special_tokens.into_py(py)),
]
.into_py_dict_bound(py);
let encode = self.0.getattr("encode")?;
let input_ids: Vec<u32> = encode.call((), Some(&kwargs))?.extract()?;
Ok(Encoding::new(
input_ids,
vec![], // type ids
vec![], // tokens (strings)
vec![], // words
vec![], // offsets
vec![], // special_tokens_mask
vec![], // attention_mask
vec![], // overflowing
std::collections::HashMap::new(), //sequence_ranges
))
}
}

/// Hub type
#[derive(Clone, Debug, Deserialize)]
pub struct HubModelInfo {
@ -811,7 +899,7 @@ pub(crate) struct ChatRequest {

/// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter.
#[serde(default)]
#[schema(nullable = true, example = "null")]
#[schema(nullable = true, default = "auto", example = "auto")]
pub tool_choice: ToolChoice,

/// Response format constraints for the generation.
@ -821,11 +909,6 @@ pub(crate) struct ChatRequest {
#[schema(nullable = true, default = "null", example = "null")]
pub response_format: Option<GrammarType>,

/// A guideline to be used in the chat_template
#[serde(default)]
#[schema(nullable = true, default = "null", example = "null")]
pub guideline: Option<String>,

/// Options for streaming response. Only set this when you set stream: true.
#[serde(default)]
#[schema(nullable = true, example = "null")]
@ -846,7 +929,6 @@ impl ChatRequest {
tool_prompt,
temperature,
response_format,
guideline,
presence_penalty,
frequency_penalty,
top_p,
@ -865,15 +947,42 @@ impl ChatRequest {
Some(temperature) if temperature == 0.0 => (false, None),
other => (true, other),
};
let (inputs, grammar, using_tools) = prepare_chat_input(
infer,
response_format,
tools,
tool_choice,
&tool_prompt,
guideline,

if response_format.is_some() && tools.is_some() {
return Err(InferError::ToolError(
"Grammar and tools are mutually exclusive".into(),
));
}

let (inputs, grammar, using_tools) = match response_format {
Some(format) => {
let inputs = infer.apply_chat_template(messages, None)?;
(inputs, Some(format), false)
}
None => {
if let Some(tools) = tools {
match ToolGrammar::apply(tools, tool_choice)? {
Some((updated_tools, tool_schema)) => {
let grammar = GrammarType::Json(serde_json::json!(tool_schema));
let inputs: String = infer.apply_chat_template(
messages,
Some((updated_tools, tool_prompt)),
)?;
(inputs, Some(grammar), true)
}
None => {
// same as if no response_format or tools are set
let inputs = infer.apply_chat_template(messages, None)?;
(inputs, None, false)
}
}
} else {
// if no response_format or tools are set simply apply the chat template to generate inputs
let inputs = infer.apply_chat_template(messages, None)?;
(inputs, None, false)
}
}
};

Ok((
GenerateRequest {
@ -918,19 +1027,11 @@ pub fn default_tool_prompt() -> String {
"\nGiven the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.\n".to_string()
}

#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, ToSchema)]
#[schema(example = "auto")]
/// Controls which (if any) tool is called by the model.
pub enum ToolType {
/// Means the model can pick between generating a message or calling one or more tools.
#[schema(rename = "auto")]
OneOf,
/// Means the model will not call any tool and instead generates a message.
#[schema(rename = "none")]
NoTool,
/// Forces the model to call a specific tool.
#[schema(rename = "function")]
Function(FunctionName),
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
#[serde(tag = "type")]
pub enum TypedChoice {
#[serde(rename = "function")]
Function { function: FunctionName },
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, ToSchema)]
@ -938,28 +1039,58 @@ pub struct FunctionName {
pub name: String,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default, ToSchema)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, ToSchema, Default)]
#[serde(from = "ToolTypeDeserializer")]
pub struct ToolChoice(pub Option<ToolType>);
#[serde(rename_all = "snake_case")]
/// <https://platform.openai.com/docs/guides/function-calling/configuring-function-calling-behavior-using-the-tool_choice-parameter>
pub enum ToolChoice {
/// Means the model can pick between generating a message or calling one or more tools.
#[default]
Auto,
/// Means the model will not call any tool and instead generates a message.
#[serde(rename = "none")]
NoTool,
/// Means the model must call one or more tools.
Required,
/// Forces the model to call a specific tool. This structure aligns with the `OpenAI` API schema to force a specific tool.
Function(FunctionName),
}

#[derive(Deserialize)]
#[derive(Deserialize, ToSchema)]
#[serde(untagged)]
/// Controls which (if any) tool is called by the model.
/// - `none` means the model will not call any tool and instead generates a message.
/// - `auto` means the model can pick between generating a message or calling one or more tools.
/// - `required` means the model must call one or more tools.
/// - Specifying a particular tool via `{\"type\": \"function\", \"function\": {\"name\": \"my_function\"}}` forces the model to call that tool.
///
/// `none` is the default when no tools are present. `auto` is the default if tools are present."
enum ToolTypeDeserializer {
/// None means `null` was passed in the JSON, and the default choice is applied based on the presence of tools.
Null,

/// `auto` means the model can pick between generating a message or calling one or more tools.
#[schema(example = "auto")]
String(String),
ToolType(ToolType),

/// Specifying a particular tool forces the model to call that tool, with structured function details.
#[schema(example = r#"{"type": "function", "function": {"name": "my_function"}}"#)]
TypedChoice(TypedChoice),
}

impl From<ToolTypeDeserializer> for ToolChoice {
fn from(value: ToolTypeDeserializer) -> Self {
match value {
ToolTypeDeserializer::Null => ToolChoice(None),
ToolTypeDeserializer::Null => ToolChoice::Auto,
ToolTypeDeserializer::String(s) => match s.as_str() {
"none" => ToolChoice(Some(ToolType::NoTool)),
"auto" => ToolChoice(Some(ToolType::OneOf)),
_ => ToolChoice(Some(ToolType::Function(FunctionName { name: s }))),
"none" => ToolChoice::NoTool,
"auto" => ToolChoice::Auto,
"required" => ToolChoice::Required,
_ => ToolChoice::Function(FunctionName { name: s }),
},
ToolTypeDeserializer::ToolType(tool_type) => ToolChoice(Some(tool_type)),
ToolTypeDeserializer::TypedChoice(TypedChoice::Function { function }) => {
ToolChoice::Function(function)
}
}
}
}
@ -1025,7 +1156,6 @@ pub(crate) struct ChatTemplateInputs<'a> {
eos_token: Option<&'a str>,
add_generation_prompt: bool,
tools: Option<Vec<Tool>>,
guideline: Option<&'a str>,
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
@ -1125,6 +1255,7 @@ pub(crate) enum OutputMessage {
}

#[derive(Clone, Debug, Deserialize, ToSchema)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct GenerateRequest {
#[schema(example = "My name is Olivier and I")]
pub inputs: String,
@ -1341,13 +1472,12 @@ impl Default for ModelsInfo {
mod tests {
use super::*;
use serde_json::json;
use tokenizers::Tokenizer;

pub(crate) async fn get_tokenizer() -> Tokenizer {
pub(crate) fn get_tokenizer() -> Tokenizer {
let api = hf_hub::api::sync::Api::new().unwrap();
let repo = api.model("gpt2".to_string());
let filename = repo.get("tokenizer.json").unwrap();
Tokenizer::from_file(filename).unwrap()
Tokenizer::Rust(tokenizers::Tokenizer::from_file(filename).unwrap())
}

#[test]
@ -1566,4 +1696,41 @@ mod tests {
r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"#
);
}

#[test]
fn tool_choice_formats() {
#[derive(Deserialize)]
struct TestRequest {
tool_choice: ToolChoice,
}

let de_none: TestRequest = serde_json::from_str(r#"{"tool_choice":"none"}"#).unwrap();
assert_eq!(de_none.tool_choice, ToolChoice::NoTool);

let de_auto: TestRequest = serde_json::from_str(r#"{"tool_choice":"auto"}"#).unwrap();
assert_eq!(de_auto.tool_choice, ToolChoice::Auto);

let de_required: TestRequest =
serde_json::from_str(r#"{"tool_choice":"required"}"#).unwrap();
assert_eq!(de_required.tool_choice, ToolChoice::Required);

let de_named: TestRequest = serde_json::from_str(r#"{"tool_choice":"myfn"}"#).unwrap();
assert_eq!(
de_named.tool_choice,
ToolChoice::Function(FunctionName {
name: "myfn".to_string(),
})
);

let de_openai_named: TestRequest = serde_json::from_str(
r#"{"tool_choice":{"type":"function","function":{"name":"myfn"}}}"#,
)
.unwrap();
assert_eq!(
de_openai_named.tool_choice,
ToolChoice::Function(FunctionName {
name: "myfn".to_string(),
})
);
}
}
@ -1,6 +1,5 @@
|
||||
/// HTTP Server logic
|
||||
use crate::config::Config;
|
||||
use crate::infer::tool_grammar::ToolGrammar;
|
||||
use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse};
|
||||
#[cfg(feature = "kserve")]
|
||||
use crate::kserve::{
|
||||
@ -20,7 +19,8 @@ use crate::{
|
||||
GenerateParameters, GenerateRequest, GenerateResponse, GrammarType, HubModelInfo,
|
||||
HubProcessorConfig, HubTokenizerConfig, Info, Message, MessageChunk, MessageContent,
|
||||
OutputMessage, PrefillToken, SimpleToken, StreamDetails, StreamOptions, StreamResponse,
|
||||
TextMessage, Token, TokenizeResponse, ToolCallDelta, ToolCallMessage, Url, Usage, Validation,
|
||||
TextMessage, Token, TokenizeResponse, Tokenizer, ToolCallDelta, ToolCallMessage, Url, Usage,
|
||||
Validation,
|
||||
};
|
||||
use crate::{
|
||||
ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
|
||||
@ -28,10 +28,10 @@ use crate::{
|
||||
ChatRequest, Chunk, CompatGenerateRequest, Completion, CompletionComplete, CompletionFinal,
|
||||
CompletionRequest, CompletionType, DeltaToolCall, Function, Prompt, Tool,
|
||||
};
|
||||
use crate::{FunctionDefinition, HubPreprocessorConfig, ToolCall, ToolChoice, ToolType};
|
||||
use crate::{FunctionDefinition, HubPreprocessorConfig, ToolCall, ToolChoice};
|
||||
use crate::{ModelInfo, ModelsInfo};
|
||||
use async_stream::__private::AsyncStream;
|
||||
use axum::extract::Extension;
|
||||
use axum::extract::{DefaultBodyLimit, Extension};
|
||||
use axum::http::{HeaderMap, HeaderValue, Method, StatusCode};
|
||||
use axum::response::sse::{Event, KeepAlive, Sse};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
@ -46,6 +46,7 @@ use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
|
||||
use hf_hub::{Cache, Repo, RepoType};
|
||||
use http::header::AUTHORIZATION;
|
||||
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::IntoPyDict;
|
||||
use regex::Regex;
|
||||
use serde_json::Value;
|
||||
@ -55,7 +56,6 @@ use std::io::BufReader;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
|
||||
use std::path::{Path, PathBuf};
|
||||
use thiserror::Error;
|
||||
use tokenizers::Tokenizer;
|
||||
use tokio::select;
|
||||
use tokio::signal;
|
||||
use tokio::sync::oneshot;
|
||||
@ -66,6 +66,41 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;
|
||||
use utoipa::OpenApi;
|
||||
use utoipa_swagger_ui::SwaggerUi;
|
||||
|
||||
fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<SimpleToken> {
|
||||
let offsets = encoding.get_offsets();
|
||||
let input_ids = encoding.get_ids();
|
||||
if offsets.len() == input_ids.len() {
|
||||
input_ids
|
||||
.iter()
|
||||
.zip(offsets)
|
||||
.map(|(&id, &(start, stop))| {
|
||||
let text = input
|
||||
.chars()
|
||||
.skip(start)
|
||||
.take(stop - start)
|
||||
.collect::<String>();
|
||||
SimpleToken {
|
||||
id,
|
||||
text,
|
||||
start,
|
||||
stop,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
encoding
|
||||
.get_ids()
|
||||
.iter()
|
||||
.map(|&id| SimpleToken {
|
||||
id,
|
||||
text: "".to_string(),
|
||||
start: 0,
|
||||
stop: 0,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate tokens if `stream == false` or a stream of token if `stream == true`
|
||||
#[utoipa::path(
|
||||
post,
|
||||
@ -75,7 +110,7 @@ request_body = CompatGenerateRequest,
|
||||
responses(
|
||||
(status = 200, description = "Generated Text",
|
||||
content(
|
||||
("application/json" = GenerateResponse),
|
||||
("application/json" = Vec<GenerateResponse>),
|
||||
("text/event-stream" = StreamResponse),
|
||||
)),
|
||||
(status = 424, description = "Generation Error", body = ErrorResponse,
|
||||
@ -151,12 +186,16 @@ async fn openai_get_model_info(info: Extension<Info>) -> Json<ModelsInfo> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Template and tokenize ChatRequest
|
||||
#[utoipa::path(
|
||||
post,
|
||||
tag = "Text Generation Inference",
|
||||
path = "/chat_tokenize",
|
||||
request_body = ChatRequest,
|
||||
responses((status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse))
|
||||
responses(
|
||||
(status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse),
|
||||
(status = 404, description = "Failed to tokenize ChatRequest", body = ErrorResponse),
|
||||
)
|
||||
)]
|
||||
async fn get_chat_tokenize(
|
||||
Extension(infer): Extension<Infer>,
|
||||
@ -167,40 +206,14 @@ async fn get_chat_tokenize(
|
||||
let generate_request: GenerateRequest = chat.try_into_generate(&infer)?.0;
|
||||
let input = generate_request.inputs.clone();
|
||||
let encoding = infer.tokenize(generate_request).await?;
|
||||
if let Some(encoding) = encoding {
|
||||
let tokens: Vec<SimpleToken> = encoding
|
||||
.get_ids()
|
||||
.iter()
|
||||
.zip(encoding.get_offsets())
|
||||
.map(|(&id, &(start, stop))| {
|
||||
let text = input
|
||||
.chars()
|
||||
.skip(start)
|
||||
.take(stop - start)
|
||||
.collect::<String>();
|
||||
SimpleToken {
|
||||
id,
|
||||
text,
|
||||
start,
|
||||
stop,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
let tokens = encoding_to_tokens(&encoding, &input);
|
||||
|
||||
let resp = ChatTokenizeResponse {
|
||||
tokenize_response: TokenizeResponse(tokens),
|
||||
templated_text: input,
|
||||
};
|
||||
Ok((HeaderMap::new(), Json(resp)))
|
||||
} else {
|
||||
Err((
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(ErrorResponse {
|
||||
error: "No fast tokenizer or tokenizer.json for this model".to_string(),
|
||||
error_type: "no fast tokenizer".to_string(),
|
||||
}),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
@ -873,7 +886,7 @@ pub(crate) async fn completions(
|
||||
|
||||
yield Ok(event);
|
||||
}
|
||||
Err(err) => yield Ok(Event::from(err)),
|
||||
Err(err) => yield Ok(err.into_openai_event()),
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -1286,7 +1299,8 @@ pub(crate) async fn chat_completions(
|
||||
};
|
||||
let mut response_as_tool = using_tools;
|
||||
while let Some(result) = response_stream.next().await {
|
||||
if let Ok(stream_token) = result {
|
||||
match result{
|
||||
Ok(stream_token) => {
|
||||
let token_text = &stream_token.token.text.clone();
|
||||
match state {
|
||||
StreamState::Buffering => {
|
||||
@ -1380,6 +1394,8 @@ pub(crate) async fn chat_completions(
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => yield Ok(err.into_openai_event())
|
||||
}
|
||||
}
|
||||
yield Ok::<Event, Infallible>(Event::default().data("[DONE]"));
|
||||
};
|
||||
@ -1484,35 +1500,8 @@ async fn tokenize(
|
||||
) -> Result<Json<TokenizeResponse>, (StatusCode, Json<ErrorResponse>)> {
|
||||
let input = req.inputs.clone();
|
||||
let encoding = infer.tokenize(req).await?;
|
||||
if let Some(encoding) = encoding {
|
||||
let tokens: Vec<SimpleToken> = encoding
|
||||
.get_ids()
|
||||
.iter()
|
||||
.zip(encoding.get_offsets())
|
||||
.map(|(&id, &(start, stop))| {
|
||||
let text = input
|
||||
.chars()
|
||||
.skip(start)
|
||||
.take(stop - start)
|
||||
.collect::<String>();
|
||||
SimpleToken {
|
||||
id,
|
||||
text,
|
||||
start,
|
||||
stop,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let tokens = encoding_to_tokens(&encoding, &input);
|
||||
Ok(Json(TokenizeResponse(tokens)))
|
||||
} else {
|
||||
Err((
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(ErrorResponse {
|
||||
error: "No fast tokenizer or tokenizer.json for this model".to_string(),
|
||||
error_type: "no fast tokenizer".to_string(),
|
||||
}),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Prometheus metrics scrape endpoint
|
||||
@ -1544,6 +1533,7 @@ tokenize,
|
||||
metrics,
|
||||
openai_get_model_info,
|
||||
sagemaker_compatibility,
|
||||
get_chat_tokenize,
|
||||
),
|
||||
components(
|
||||
schemas(
|
||||
@ -1594,13 +1584,13 @@ GrammarType,
|
||||
Usage,
|
||||
StreamOptions,
|
||||
DeltaToolCall,
|
||||
ToolType,
|
||||
Tool,
|
||||
ToolCall,
|
||||
Function,
|
||||
FunctionDefinition,
|
||||
ToolChoice,
|
||||
ModelInfo,
|
||||
ChatTokenizeResponse,
|
||||
)
|
||||
),
|
||||
tags(
|
||||
@ -1620,6 +1610,71 @@ pub fn schema() -> ApiDoc {
|
||||
ApiDoc
|
||||
}
|
||||
|
||||
fn py_resolve_tokenizer(
|
||||
py: pyo3::Python,
|
||||
tokenizer_name: &str,
|
||||
revision: Option<&str>,
|
||||
trust_remote_code: bool,
|
||||
) -> pyo3::PyResult<()> {
|
||||
let transformers = py.import_bound("transformers")?;
|
||||
let auto = transformers.getattr("AutoTokenizer")?;
|
||||
let from_pretrained = auto.getattr("from_pretrained")?;
|
||||
let args = (tokenizer_name,);
|
||||
let kwargs = if let Some(rev) = &revision {
|
||||
[
|
||||
("revision", rev.to_string().into_py(py)),
|
||||
("trust_remote_code", trust_remote_code.into_py(py)),
|
||||
]
|
||||
.into_py_dict_bound(py)
|
||||
} else {
|
||||
[("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
|
||||
};
|
||||
let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
|
||||
let save = tokenizer.getattr("save_pretrained")?;
|
||||
let args = ("out".to_string(),);
|
||||
save.call1(args)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
|
||||
// XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3
|
||||
// and state-spaces/mamba-130m
|
||||
tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization");
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct FallbackConfig {
|
||||
base_model_name_or_path: Option<String>,
|
||||
model_type: Option<String>,
|
||||
ssm_config: Option<serde_json::Value>,
|
||||
}
|
||||
config_filename.and_then(|filename| {
|
||||
std::fs::read_to_string(filename)
|
||||
.ok()
|
||||
.as_ref()
|
||||
.and_then(|c| {
|
||||
let config: Result<FallbackConfig, _> = serde_json::from_str(c);
|
||||
if let Ok(config) = config {
|
||||
if config.model_type.is_none() {
|
||||
if let Some(base) = config.base_model_name_or_path {
|
||||
pyo3::Python::with_gil(|py| -> PyResult<()> {
|
||||
py_resolve_tokenizer(py, &base, Some("main"), false)
|
||||
})
|
||||
.ok()?;
|
||||
}
|
||||
}
|
||||
if config.ssm_config.is_some() {
|
||||
// XXX Legacy mamba
|
||||
pyo3::Python::with_gil(|py| -> PyResult<()> {
|
||||
py_resolve_tokenizer(py, "EleutherAI/gpt-neox-20b", Some("main"), false)
|
||||
})
|
||||
.ok()?;
|
||||
}
|
||||
}
|
||||
Some(())
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Serving method
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn run(
|
||||
@ -1645,6 +1700,7 @@ pub async fn run(
|
||||
disable_grammar_support: bool,
|
||||
max_client_batch_size: usize,
|
||||
usage_stats_level: usage_stats::UsageStatsLevel,
|
||||
payload_limit: usize,
|
||||
) -> Result<(), WebServerError> {
|
||||
// CORS allowed origins
|
||||
// map to go inside the option and then map to parse from String to HeaderValue
|
||||
@ -1713,7 +1769,6 @@ pub async fn run(
|
||||
|
||||
// Load tokenizer and model info
|
||||
let (
|
||||
tokenizer_filename,
|
||||
config_filename,
|
||||
tokenizer_config_filename,
|
||||
preprocessor_config_filename,
|
||||
@ -1721,7 +1776,6 @@ pub async fn run(
|
||||
model_info,
|
||||
) = match api {
|
||||
Type::None => (
|
||||
Some(local_path.join("tokenizer.json")),
|
||||
Some(local_path.join("config.json")),
|
||||
Some(local_path.join("tokenizer_config.json")),
|
||||
Some(local_path.join("preprocessor_config.json")),
|
||||
@ -1735,10 +1789,6 @@ pub async fn run(
|
||||
revision.clone().unwrap_or_else(|| "main".to_string()),
|
||||
));
|
||||
|
||||
let tokenizer_filename = match api_repo.get("tokenizer.json").await {
|
||||
Ok(tokenizer_filename) => Some(tokenizer_filename),
|
||||
Err(_) => get_base_tokenizer(&api, &api_repo).await,
|
||||
};
|
||||
let config_filename = api_repo.get("config.json").await.ok();
|
||||
let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
|
||||
let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
|
||||
@ -1751,7 +1801,6 @@ pub async fn run(
|
||||
None
|
||||
};
|
||||
(
|
||||
tokenizer_filename,
|
||||
config_filename,
|
||||
tokenizer_config_filename,
|
||||
preprocessor_config_filename,
|
||||
@ -1766,7 +1815,6 @@ pub async fn run(
|
||||
revision.clone().unwrap_or_else(|| "main".to_string()),
|
||||
));
|
||||
(
|
||||
repo.get("tokenizer.json"),
|
||||
repo.get("config.json"),
|
||||
repo.get("tokenizer_config.json"),
|
||||
repo.get("preprocessor_config.json"),
|
||||
@ -1788,39 +1836,31 @@ pub async fn run(
|
||||
HubTokenizerConfig::default()
|
||||
});
|
||||
|
||||
let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
|
||||
let tokenizer: Tokenizer = {
|
||||
use pyo3::prelude::*;
|
||||
let convert = pyo3::Python::with_gil(|py| -> PyResult<()> {
|
||||
let transformers = py.import_bound("transformers")?;
|
||||
let auto = transformers.getattr("AutoTokenizer")?;
|
||||
let from_pretrained = auto.getattr("from_pretrained")?;
|
||||
let args = (tokenizer_name.to_string(),);
|
||||
let kwargs = [
|
||||
(
|
||||
"revision",
|
||||
(revision.clone().unwrap_or_else(|| "main".to_string())).into_py(py),
|
||||
),
|
||||
("trust_remote_code", trust_remote_code.into_py(py)),
|
||||
]
|
||||
.into_py_dict_bound(py);
|
||||
let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
|
||||
let save = tokenizer.getattr("save_pretrained")?;
|
||||
let args = ("out".to_string(),);
|
||||
save.call1(args)?;
|
||||
pyo3::Python::with_gil(|py| -> PyResult<()> {
|
||||
py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), trust_remote_code)?;
|
||||
Ok(())
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
tracing::error!("Failed to import python tokenizer {err}");
|
||||
});
|
||||
let filename = if convert.is_ok() {
|
||||
// If we have correctly loaded and resaved with transformers
|
||||
// We might have modified the tokenizer.json according to transformers
|
||||
"out/tokenizer.json".into()
|
||||
})
|
||||
.or_else(|err| {
|
||||
let out = legacy_tokenizer_handle(config_filename.as_ref());
|
||||
out.ok_or(err)
|
||||
})
|
||||
.expect("We cannot load a tokenizer");
|
||||
let filename = "out/tokenizer.json";
|
||||
if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
|
||||
Tokenizer::Rust(tok)
|
||||
} else {
|
||||
filename
|
||||
Tokenizer::Python {
|
||||
tokenizer_name: tokenizer_name.clone(),
|
||||
revision: revision.clone(),
|
||||
trust_remote_code,
|
||||
}
|
||||
}
|
||||
};
|
||||
Tokenizer::from_file(filename).ok()
|
||||
});
|
||||
|
||||
let config: Option<Config> = config_filename.and_then(|filename| {
|
||||
std::fs::read_to_string(filename)
|
||||
@ -1848,10 +1888,6 @@ pub async fn run(
|
||||
preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file);
|
||||
|
||||
tracing::info!("Using config {config:?}");
|
||||
if tokenizer.is_none() {
|
||||
tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}");
|
||||
tracing::warn!("Rust input length validation and truncation is disabled");
|
||||
}
|
||||
|
||||
// Only send usage stats when TGI is run in container and the function returns Some
|
||||
let is_container = matches!(usage_stats::is_container(), Ok(true));
|
||||
@ -1919,6 +1955,7 @@ pub async fn run(
|
||||
model_info,
|
||||
compat_return_full_text,
|
||||
allow_origin,
|
||||
payload_limit,
|
||||
)
|
||||
.await;
|
||||
|
||||
@ -1966,7 +2003,7 @@ async fn start(
|
||||
validation_workers: usize,
|
||||
api_key: Option<String>,
|
||||
config: Option<Config>,
|
||||
(tokenizer, tokenizer_config): (Option<Tokenizer>, HubTokenizerConfig),
|
||||
(tokenizer, tokenizer_config): (Tokenizer, HubTokenizerConfig),
|
||||
(preprocessor_config, processor_config): (Option<HubPreprocessorConfig>, HubProcessorConfig),
|
||||
hostname: String,
|
||||
port: u16,
|
||||
@ -1978,6 +2015,7 @@ async fn start(
|
||||
model_info: HubModelInfo,
|
||||
compat_return_full_text: bool,
|
||||
allow_origin: Option<AllowOrigin>,
|
||||
payload_limit: usize,
|
||||
) -> Result<(), WebServerError> {
|
||||
// Determine the server port based on the feature and environment variable.
|
||||
let port = if cfg!(feature = "google") {
|
||||
@ -2375,6 +2413,7 @@ async fn start(
|
||||
.layer(Extension(compute_type))
|
||||
.layer(Extension(prom_handle.clone()))
|
||||
.layer(OtelAxumLayer::default())
|
||||
.layer(DefaultBodyLimit::max(payload_limit))
|
||||
.layer(axum::middleware::from_fn(trace_context_middleware))
|
||||
.layer(cors_layer);
|
||||
|
||||
@ -2427,30 +2466,6 @@ pub async fn get_hub_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
|
||||
}
|
||||
}
|
||||
|
||||
/// get base tokenizer
|
||||
pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option<PathBuf> {
|
||||
let config_filename = api_repo.get("config.json").await.ok()?;
|
||||
|
||||
// Open the file in read-only mode with buffer.
|
||||
let file = File::open(config_filename).ok()?;
|
||||
let reader = BufReader::new(file);
|
||||
|
||||
// Read the JSON contents of the file as an instance of `User`.
|
||||
let config: serde_json::Value = serde_json::from_reader(reader).ok()?;
|
||||
|
||||
if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") {
|
||||
let api_base_repo = api.repo(Repo::with_revision(
|
||||
base_model_id.to_string(),
|
||||
RepoType::Model,
|
||||
"main".to_string(),
|
||||
));
|
||||
|
||||
api_base_repo.get("tokenizer.json").await.ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// get tokenizer_config from the Huggingface Hub
|
||||
pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConfig> {
|
||||
let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?;
|
||||
@ -2539,157 +2554,3 @@ pub enum WebServerError {
|
||||
#[error("Axum error: {0}")]
|
||||
Axum(#[from] axum::BoxError),
|
||||
}
|
||||
|
||||
type PreparedInput = (String, Option<GrammarType>, bool);
|
||||
|
||||
pub(crate) fn prepare_chat_input(
|
||||
infer: &Infer,
|
||||
response_format: Option<GrammarType>,
|
||||
tools: Option<Vec<Tool>>,
|
||||
tool_choice: ToolChoice,
|
||||
tool_prompt: &str,
|
||||
guideline: Option<String>,
|
||||
messages: Vec<Message>,
|
||||
) -> Result<PreparedInput, InferError> {
|
||||
if response_format.is_some() && tools.is_some() {
|
||||
return Err(InferError::ToolError(
|
||||
"Grammar and tools are mutually exclusive".into(),
|
||||
));
|
||||
}
|
||||
|
||||
// when response_format is set, tools are not included when applying the chat template to generate inputs
|
||||
if let Some(format) = response_format {
|
||||
let inputs = infer.apply_chat_template(guideline, messages, None)?;
|
||||
return Ok((inputs, Some(format), false));
|
||||
}
|
||||
|
||||
// when no response_format is set and tools are included, apply the chat template with the tools
|
||||
// to generate inputs
|
||||
if let Some(tools) = tools {
|
||||
let (updated_tools, tool_schema) = ToolGrammar::apply(tools, tool_choice)?;
|
||||
|
||||
let grammar = tool_schema
|
||||
.as_ref()
|
||||
.map(|t| GrammarType::Json(serde_json::json!(t)));
|
||||
|
||||
let inputs: String = infer.apply_chat_template(
|
||||
guideline,
|
||||
messages,
|
||||
Some((updated_tools, tool_prompt.into())),
|
||||
)?;
|
||||
return Ok((inputs, grammar, tool_schema.is_some()));
|
||||
}
|
||||
|
||||
// if no response_format or tools are set simply apply the chat template to generate inputs
|
||||
let inputs = infer.apply_chat_template(guideline, messages, None)?;
|
||||
Ok((inputs, None, false))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
use super::*;
use crate::ChatTemplateVersions;
use crate::HubTokenizerConfig;
use crate::TokenizerConfigToken;
use crate::Tool;

use serde_json::json;

#[test]
fn test_prepare_chat_input() {
// Mock Backend to avoid network requests
struct MockBackend;

impl Backend for MockBackend {
fn schedule(
&self,
_request: crate::validation::ValidGenerateRequest,
) -> Result<
tokio_stream::wrappers::UnboundedReceiverStream<
Result<InferStreamResponse, InferError>,
>,
InferError,
> {
unimplemented!("Never called in this test");
}
fn health<'a, 'async_trait>(
&'a self,
_current_health: bool,
) -> core::pin::Pin<
Box<dyn core::future::Future<Output = bool> + core::marker::Send + 'async_trait>,
>
where
'a: 'async_trait,
Self: 'async_trait,
{
unimplemented!("Never called in this test");
}
}

let backend = MockBackend {};

let mut tokenizer_config = HubTokenizerConfig::default();

// mock tokenizer config values
tokenizer_config.bos_token = Some(TokenizerConfigToken::String("<s>".to_string()));
tokenizer_config.eos_token = Some(TokenizerConfigToken::String("</s>".to_string()));
tokenizer_config.chat_template = Some(
ChatTemplateVersions::Single("{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n".to_string())
);

let infer = Infer::new(
backend,
Validation::new(1, None, None, None, 1, 1, 1, 1, 1, false),
1,
tokenizer_config,
HubProcessorConfig::default(),
);
let response_format = None;
let tools = Some(vec![Tool {
r#type: "function".to_string(),
function: FunctionDefinition {
name: "get_current_weather".to_string(),
description: Some("Get the current weather".to_string()),
arguments: json!({
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location."
}
},
"required": ["location", "format"]
}),
},
}]);
let tool_prompt = "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.";
let guideline = None;
let messages = vec![Message {
name: None,
role: "user".to_string(),
content: MessageContent::SingleText(
"What is the weather like in New York?".to_string(),
),
}];

let result = prepare_chat_input(
&infer,
response_format,
tools,
ToolChoice(None),
tool_prompt,
guideline,
messages,
);

assert!(result.is_ok());
let (inputs, _grammar, using_tools) = result.expect("Failed to prepare chat input");
assert_eq!(using_tools, true);
assert_eq!(inputs, "<s>[AVAILABLE_TOOLS] [{\"type\": \"function\", \"function\": {\"arguments\": {\"properties\":{\"format\":{\"description\":\"The temperature unit to use. Infer this from the users location.\",\"enum\":[\"celsius\",\"fahrenheit\"],\"type\":\"string\"},\"location\":{\"description\":\"The city and state, e.g. San Francisco, CA\",\"type\":\"string\"}},\"required\":[\"location\",\"format\"],\"type\":\"object\"}, \"description\": \"Get the current weather\", \"name\": \"get_current_weather\"}}, {\"type\": \"function\", \"function\": {\"arguments\": {\"properties\":{\"content\":{\"description\":\"The response content\",\"type\":\"string\"}},\"required\":[\"content\"],\"type\":\"object\"}, \"description\": \"Open ened response with no specific tool selected\", \"name\": \"no_tool\"}}][/AVAILABLE_TOOLS][INST] What is the weather like in New York?\n---\nGiven the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.[/INST]".to_string());
}
}

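Note: when tools are passed, the router derives a JSON grammar from the tool definitions and constrains generation with it. A rough, self-contained sketch of the kind of schema involved is shown below; it mirrors the get_current_weather arguments from the test above, built with serde_json for illustration, while the real schema comes from ToolGrammar::apply and differs in shape.

// Illustrative only: the argument schema used in the test, as a serde_json value.
use serde_json::json;

fn main() {
    let weather_arguments = json!({
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA"
            },
            "format": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"]
            }
        },
        "required": ["location", "format"]
    });
    // A grammar such as GrammarType::Json(weather_arguments) is what constrains the
    // model so that its output parses as arguments matching this schema.
    println!("{}", serde_json::to_string_pretty(&weather_arguments).unwrap());
}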
@ -3,7 +3,9 @@ use crate::config::Config;
use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
use crate::{
GenerateParameters, GenerateRequest, GrammarType, HubPreprocessorConfig, Idefics2Preprocessor,
TokenizerTrait,
};
use crate::{PyTokenizer, Tokenizer};
use base64::{engine::general_purpose::STANDARD, Engine};
use image::{ImageFormat, ImageReader};
use jsonschema::{Draft, JSONSchema};
@ -13,7 +15,6 @@ use std::io::Cursor;
use std::iter;
use std::sync::Arc;
use thiserror::Error;
use tokenizers::tokenizer::Tokenizer;
use tokio::sync::mpsc;
use tokio::sync::oneshot;
use tracing::{instrument, Span};
@ -30,14 +31,14 @@ pub struct Validation {
max_total_tokens: usize,
disable_grammar_support: bool,
/// Channel to communicate with the background tokenization task
sender: Option<mpsc::UnboundedSender<TokenizerRequest>>,
sender: mpsc::UnboundedSender<TokenizerRequest>,
}

impl Validation {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
workers: usize,
tokenizer: Option<Tokenizer>,
tokenizer: Tokenizer,
config: Option<Config>,
preprocessor_config: Option<HubPreprocessorConfig>,
max_best_of: usize,
@ -47,8 +48,13 @@ impl Validation {
max_total_tokens: usize,
disable_grammar_support: bool,
) -> Self {
let workers = if let Tokenizer::Python { .. } = &tokenizer {
1
} else {
workers
};
// If we have a fast tokenizer
let sender = if let Some(tokenizer) = tokenizer {
let sender = {
// Create round robin channel
let (validation_sender, validation_round_robin_receiver) = mpsc::unbounded_channel();
let mut senders = Vec::with_capacity(workers);
@ -75,9 +81,7 @@ impl Validation {
// Create tokenization round robin task
tokio::spawn(round_robin_task(validation_round_robin_receiver, senders));

Some(validation_sender)
} else {
None
validation_sender
};

Self {
@ -97,14 +101,14 @@ impl Validation {
inputs: String,
add_special_tokens: bool,
truncate: Option<usize>,
) -> Result<Option<(tokenizers::Encoding, Vec<Chunk>)>, ValidationError> {
) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
// If we have a fast tokenizer
if let Some(sender) = &self.sender {
// Create response channel
let (response_sender, response_receiver) = oneshot::channel();
// Send request to the background validation task
// Unwrap is safe here
sender
let _ = &self
.sender
.send((
(inputs, add_special_tokens, truncate),
response_sender,
@ -115,10 +119,7 @@ impl Validation {
// Await on response channel
// Unwrap is safe here
let encoding = response_receiver.await.unwrap()?;
Ok(Some(encoding))
} else {
Ok(None)
}
Ok(encoding)
}

#[allow(clippy::type_complexity)]
@ -131,10 +132,9 @@ impl Validation {
max_new_tokens: Option<u32>,
) -> Result<(Vec<Chunk>, Option<Vec<u32>>, usize, u32), ValidationError> {
// If we have a fast tokenizer
if let Some((encoding, inputs)) = self
let (encoding, inputs) = self
.tokenize(inputs.clone(), add_special_tokens, truncate)
.await?
{
.await?;
// Create response channel
let input_length = if let Some(truncate) = truncate {
std::cmp::min(encoding.len(), truncate)
@ -173,35 +173,6 @@ impl Validation {
metrics::histogram!("tgi_request_input_length").record(input_length as f64);
Ok((inputs, Some(input_ids), input_length, max_new_tokens))
}
// Return inputs without validation
else {
// In this case, we don't know the real length in tokens of the inputs
// However, the inputs will be truncated by the python servers
// We make sure that truncate + max_new_tokens <= self.max_total_tokens
let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
max_new_tokens
} else if let Some(truncate) = truncate {
self.max_total_tokens.saturating_sub(truncate) as u32
} else {
return Err(ValidationError::UnsetMaxNewTokens);
};
let mut input_length = truncate.unwrap_or(self.max_input_length);

// We don't have a tokenizer, therefore we have no idea how long is the query, let
// them through and hope for the best.
// Validate MaxNewTokens
if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
input_length = input_length.saturating_sub(max_new_tokens as usize);
}

Ok((
vec![Chunk::Text(inputs)],
None,
input_length,
max_new_tokens,
))
}
}

/// Validate a payload and get the number of tokens in the input
#[instrument(skip_all)]
@ -464,6 +435,15 @@ fn tokenizer_worker(
preprocessor_config: Option<HubPreprocessorConfig>,
mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
) {
match tokenizer {
Tokenizer::Python {
tokenizer_name,
revision,
trust_remote_code,
} => {
pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
let tokenizer =
PyTokenizer::from_py(py, tokenizer_name, revision, trust_remote_code)?;
// Loop over requests
while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
receiver.blocking_recv()
@ -481,6 +461,29 @@ fn tokenizer_worker(
.unwrap_or(())
})
}
Ok(())
})
.expect("Failure in python tokenizer worker");
}
Tokenizer::Rust(tokenizer) => {
while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
receiver.blocking_recv()
{
parent_span.in_scope(|| {
response_tx
.send(prepare_input(
inputs,
truncate,
add_special_tokens,
&tokenizer,
config.as_ref(),
preprocessor_config.as_ref(),
))
.unwrap_or(())
})
}
}
}
}

fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
@ -593,6 +596,10 @@ fn image_tokens(
}
Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
Qwen2Vl(config) => format!(
"<|vision_start|>{:?}<|vision_end|>",
"<|image_pad|>".repeat(config.get_number_of_features(height, width))
),
_ => unimplemented!("Images tokens are not supported for this model configuration"),
}
}
@ -608,18 +615,20 @@ fn image_tokens_fixup(config: &Config, text: String) -> String {
}

/// Get input length and optionally truncate it
fn prepare_input(
fn prepare_input<T: TokenizerTrait>(
inputs: String,
_truncate: Option<usize>,
add_special_tokens: bool,
tokenizer: &Tokenizer,
tokenizer: &T,
config: Option<&Config>,
preprocessor_config: Option<&HubPreprocessorConfig>,
) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
use Config::*;
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
let (tokenizer_query, input_chunks) = match config {
Some(config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_))) => {
Some(
config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_)),
) => {
let mut input_chunks = Vec::new();
let mut tokenizer_query = String::with_capacity(inputs.len());
let mut start = 0;
@ -649,7 +658,7 @@ fn prepare_input(

// Get the number of tokens in the input
let encoding = tokenizer
.encode(tokenizer_query, add_special_tokens)
.encode_trait(tokenizer_query, add_special_tokens)
.map_err(|err| ValidationError::Tokenizer(err.to_string()))?;

Ok((encoding, input_chunks))
@ -824,7 +833,7 @@ mod tests {

#[tokio::test]
async fn test_validation_max_new_tokens() {
let tokenizer = None;
let tokenizer = get_tokenizer();
let max_best_of = 2;
let max_stop_sequence = 3;
let max_top_n_tokens = 4;
@ -851,15 +860,15 @@ mod tests {
.validate_input("Hello".to_string(), true, None, Some(max_new_tokens))
.await
{
// Err(ValidationError::MaxNewTokens(1, 10)) => (),
Ok((_s, _, 0, 10)) => (),
Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
// Ok((_s, _, 0, 10)) => (),
r => panic!("Unexpected not max new tokens: {r:?}"),
}
}

#[tokio::test]
async fn test_validation_input_length() {
let tokenizer = Some(get_tokenizer().await);
let tokenizer = get_tokenizer();
let max_best_of = 2;
let max_stop_sequence = 3;
let max_top_n_tokens = 4;
@ -893,7 +902,7 @@ mod tests {

#[tokio::test]
async fn test_validation_best_of_sampling() {
let tokenizer = Some(get_tokenizer().await);
let tokenizer = get_tokenizer();
let max_best_of = 2;
let max_stop_sequence = 3;
let max_top_n_tokens = 4;
@ -933,7 +942,7 @@ mod tests {

#[tokio::test]
async fn test_validation_top_p() {
let tokenizer = Some(get_tokenizer().await);
let tokenizer = get_tokenizer();
let max_best_of = 2;
let max_stop_sequence = 3;
let max_top_n_tokens = 4;
@ -1004,7 +1013,7 @@ mod tests {

#[tokio::test]
async fn test_validation_top_n_tokens() {
let tokenizer = Some(get_tokenizer().await);
let tokenizer = get_tokenizer();
let max_best_of = 2;
let max_stop_sequences = 3;
let max_top_n_tokens = 4;
@ -1089,7 +1098,7 @@ mod tests {
async fn test_prepare_input_chunks() {
let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();

let tokenizer = Some(get_tokenizer().await);
let tokenizer = get_tokenizer();

let max_best_of = 2;
let max_stop_sequence = 3;
@ -1124,7 +1133,7 @@ mod tests {
)
.await
{
Ok(Some((_encoding, chunks))) => chunks,
Ok((_encoding, chunks)) => chunks,
_ => panic!("Unexpected tokenization failure"),
};

@ -1146,7 +1155,7 @@ mod tests {
async fn test_idefics2_correct_n_fake_tokens() {
let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();

let tokenizer = Some(get_tokenizer().await);
let tokenizer = get_tokenizer();

let max_best_of = 2;
let max_stop_sequence = 3;
@ -1184,7 +1193,7 @@ mod tests {
)
.await
{
Ok(Some((encoding, chunks))) => (encoding, chunks),
Ok((encoding, chunks)) => (encoding, chunks),
_ => panic!("Unexpected tokenization failure"),
};

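Note: the Validation changes above spawn one tokenizer worker per thread and fan requests out over a round-robin channel. A minimal, std-only sketch of that dispatch pattern follows; the real code uses tokio unbounded channels and a dedicated round_robin_task, so the channel types and names here are illustrative.

// Sketch only: a front channel whose dispatcher cycles over per-worker senders.
use std::sync::mpsc;
use std::thread;
use std::time::Duration;

fn main() {
    let workers = 3;

    // One channel per worker; each worker blocks on its own receiver.
    let mut worker_senders = Vec::with_capacity(workers);
    for id in 0..workers {
        let (tx, rx) = mpsc::channel::<String>();
        worker_senders.push(tx);
        thread::spawn(move || {
            for job in rx {
                println!("worker {id} tokenizing: {job}");
            }
        });
    }

    // Front channel: a dispatcher thread forwards each request to the next worker in turn.
    let (front_tx, front_rx) = mpsc::channel::<String>();
    thread::spawn(move || {
        for (i, job) in front_rx.into_iter().enumerate() {
            let _ = worker_senders[i % workers].send(job);
        }
    });

    for i in 0..6 {
        front_tx.send(format!("request {i}")).unwrap();
    }
    drop(front_tx);
    thread::sleep(Duration::from_millis(100)); // crude: let workers drain before exit
}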
@ -5,7 +5,6 @@ include Makefile-awq
include Makefile-eetq
include Makefile-selective-scan
include Makefile-lorax-punica
include Makefile-fbgemm
include Makefile-exllamav2
include Makefile-flashinfer

@ -24,14 +23,14 @@ gen-server:
install-server: gen-server
pip install pip --upgrade
pip install -r requirements_cuda.txt
pip install -e ".[accelerate, quantize, peft, outlines]"
pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"


install: install-cuda
echo "Installed server"

install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm
pip install -e ".[bnb,marlin,moe]"
install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
pip install -e ".[attention,bnb,marlin,moe]"
pip install nvidia-nccl-cu12==2.22.3

install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
@ -1,15 +0,0 @@
fbgemm_commit := v0.8.0

build-fbgemm:
@if [ ! -d "fbgemm" ]; then \
git clone https://github.com/pytorch/FBGEMM.git fbgemm; \
fi
cd fbgemm && git fetch && git checkout $(fbgemm_commit) && \
git submodule update --init --recursive && \
cd fbgemm_gpu && \
pip install -r requirements.txt && \
CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai build

install-fbgemm: build-fbgemm
cd fbgemm/fbgemm_gpu && \
CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai install
@ -1,14 +1,4 @@
commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
build-vllm-cuda:
if [ ! -d 'vllm' ]; then \
pip install -U ninja packaging --no-cache-dir && \
git clone https://github.com/Narsil/vllm.git vllm; \
fi
cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build

install-vllm-cuda: build-vllm-cuda
cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e .

build-vllm-rocm:
if [ ! -d 'vllm' ]; then \
494
server/poetry.lock
generated
@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

[[package]]
name = "accelerate"
@ -167,6 +167,17 @@ files = [
[package.dependencies]
frozenlist = ">=1.1.0"

[[package]]
name = "airportsdata"
version = "20241001"
description = "Extensive database of location and timezone data for nearly every airport and landing strip in the world."
optional = true
python-versions = ">=3.9"
files = [
{file = "airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd"},
{file = "airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00"},
]

[[package]]
name = "annotated-types"
version = "0.7.0"
@ -189,6 +200,74 @@ files = [
|
||||
{file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "attention-kernels"
|
||||
version = "0.1.1"
|
||||
description = "Attention kernels"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
torch = "*"
|
||||
|
||||
[package.source]
|
||||
type = "url"
|
||||
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
|
||||
|
||||
[[package]]
|
||||
name = "attention-kernels"
|
||||
version = "0.1.1"
|
||||
description = "Attention kernels"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
torch = "*"
|
||||
|
||||
[package.source]
|
||||
type = "url"
|
||||
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
|
||||
|
||||
[[package]]
|
||||
name = "attention-kernels"
|
||||
version = "0.1.1"
|
||||
description = "Attention kernels"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
torch = "*"
|
||||
|
||||
[package.source]
|
||||
type = "url"
|
||||
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
|
||||
|
||||
[[package]]
|
||||
name = "attention-kernels"
|
||||
version = "0.1.1"
|
||||
description = "Attention kernels"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
torch = "*"
|
||||
|
||||
[package.source]
|
||||
type = "url"
|
||||
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
|
||||
|
||||
[[package]]
|
||||
name = "attrs"
|
||||
version = "24.2.0"
|
||||
@ -388,6 +467,26 @@ files = [
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compressed-tensors"
|
||||
version = "0.7.1"
|
||||
description = "Library for utilization of compressed safetensors of neural network models"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
|
||||
{file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pydantic = ">=2.0"
|
||||
torch = ">=1.7.0"
|
||||
transformers = "*"
|
||||
|
||||
[package.extras]
|
||||
accelerate = ["accelerate"]
|
||||
dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "datasets"
|
||||
version = "2.21.0"
|
||||
@ -518,88 +617,103 @@ typing = ["typing-extensions (>=4.12.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "frozenlist"
|
||||
version = "1.4.1"
|
||||
version = "1.5.0"
|
||||
description = "A list-like structure which implements collections.abc.MutableSequence"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"},
|
||||
{file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"},
|
||||
{file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"},
|
||||
{file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"},
|
||||
{file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"},
|
||||
{file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"},
|
||||
{file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"},
|
||||
{file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"},
|
||||
{file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"},
|
||||
{file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"},
|
||||
{file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"},
|
||||
{file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"},
|
||||
{file = "frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"},
|
||||
{file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"},
|
||||
{file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"},
|
||||
{file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1008,17 +1122,6 @@ MarkupSafe = ">=2.0"
|
||||
[package.extras]
|
||||
i18n = ["Babel (>=2.7)"]
|
||||
|
||||
[[package]]
|
||||
name = "joblib"
|
||||
version = "1.4.2"
|
||||
description = "Lightweight pipelining with Python functions"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
|
||||
{file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonschema"
|
||||
version = "4.23.0"
|
||||
@ -1071,36 +1174,6 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"]
|
||||
nearley = ["js2py"]
|
||||
regex = ["regex"]
|
||||
|
||||
[[package]]
name = "llvmlite"
version = "0.43.0"
description = "lightweight wrapper around basic LLVM functionality"
optional = true
python-versions = ">=3.9"
files = [
{file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"},
{file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"},
{file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"},
{file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"},
{file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"},
{file = "llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"},
{file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"},
{file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"},
{file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"},
{file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"},
{file = "llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"},
{file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"},
{file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"},
{file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"},
{file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"},
{file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"},
{file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"},
{file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"},
{file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"},
{file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"},
{file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"},
]

[[package]]
name = "loguru"
version = "0.6.0"
@ -1215,12 +1288,12 @@ files = [

[[package]]
name = "marlin-kernels"
version = "0.3.0"
version = "0.3.5"
description = "Marlin quantization kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:a2086b9e98d22071f52c5b4b4b98b1b4a988565258905173fa74c5a9eddd1a0a"},
{file = "marlin_kernels-0.3.5+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:58d4bf0aa1a9533acc05f1e5bf50f727ed0129848d1fa1feb2c5c3fa482518d4"},
]

[package.dependencies]
@ -1228,16 +1301,16 @@ torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp310-cp310-linux_x86_64.whl"

[[package]]
name = "marlin-kernels"
version = "0.3.0"
version = "0.3.5"
description = "Marlin quantization kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:f39a6946d8247629446ec170832d832c7038c363f1d8803211fe67249c2d804d"},
{file = "marlin_kernels-0.3.5+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:a3a3653e6908db013ca96979a5ee1f6a8bb590ee7506a129e06b87d4a8cbb87d"},
]

[package.dependencies]
@ -1245,16 +1318,16 @@ torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp311-cp311-linux_x86_64.whl"

[[package]]
name = "marlin-kernels"
version = "0.3.0"
version = "0.3.5"
description = "Marlin quantization kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:07fd869d5289777fa866107dae676523e18b1f6ba4afce79946ddc58a6870169"},
{file = "marlin_kernels-0.3.5+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:967b4765a591530a4b9160ae32f3f352a89ae4c71daf43220c99976987d76723"},
]

[package.dependencies]
@ -1262,16 +1335,16 @@ torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp312-cp312-linux_x86_64.whl"

[[package]]
name = "marlin-kernels"
version = "0.3.0"
version = "0.3.5"
description = "Marlin quantization kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:0dedaa418225d490a5f1d8f85dbc75e439a8c43a8870e4ef32945bf61672d7dc"},
{file = "marlin_kernels-0.3.5+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:fbe607d5afd1e1fca6e294c3594a0ec279d1f9ea6a2fdf7f34ccb6180d15e195"},
]

[package.dependencies]
@ -1279,7 +1352,7 @@ torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp39-cp39-linux_x86_64.whl"

[[package]]
name = "mdurl"
@ -1294,12 +1367,12 @@ files = [

[[package]]
name = "moe-kernels"
version = "0.6.0"
version = "0.7.0"
description = "MoE kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "moe_kernels-0.6.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:f28fd2a56c3ac7bfe74bc44cc7c8c0791a2644ad689b084ea4ed6decb7f41c25"},
{file = "moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:f8c126395f11522881c6bf1f6120e3670822006a84e2ff74af561c22445746b3"},
]

[package.dependencies]
@ -1309,16 +1382,16 @@ triton = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl"

[[package]]
name = "moe-kernels"
version = "0.6.0"
version = "0.7.0"
description = "MoE kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "moe_kernels-0.6.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:db475948fd9f7a8647aa3f73256ff4d3bb111425305bcd0b0d3559ccc75b8937"},
{file = "moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:2afff8346251f01d5d90bab738e3dfaa6b14a414a9c88205d396ab2bae87983a"},
]

[package.dependencies]
@ -1328,16 +1401,16 @@ triton = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl"

[[package]]
name = "moe-kernels"
version = "0.6.0"
version = "0.7.0"
description = "MoE kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "moe_kernels-0.6.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:364be07c06aafbab1f51d9e26d9a4ff658defe1462a4c645abaf7b895ed163a8"},
{file = "moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:b1a29e33d3b7d85e2b4f8bd47db28211096d1f645e0868d5a1f3666ebb9bd9e3"},
]

[package.dependencies]
@ -1347,16 +1420,16 @@ triton = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl"

[[package]]
name = "moe-kernels"
version = "0.6.0"
version = "0.7.0"
description = "MoE kernels"
optional = true
python-versions = ">=3.7"
files = [
{file = "moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:81e7fa25fb5ed5336f5151994f5e3f600df7e166fe013576968c59415e442894"},
{file = "moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:9573611174cda9f6fafa1816521e38582fd2903b321bbaf78f83cf6e3189ac7d"},
]

[package.dependencies]
@ -1366,7 +1439,7 @@ triton = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl"

[[package]]
name = "mpmath"
@ -1542,40 +1615,6 @@ doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]

[[package]]
name = "numba"
version = "0.60.0"
description = "compiling Python code using LLVM"
optional = true
python-versions = ">=3.9"
files = [
{file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"},
{file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"},
{file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"},
{file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"},
{file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"},
{file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"},
{file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"},
{file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"},
{file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"},
{file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"},
{file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"},
{file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"},
{file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"},
{file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"},
{file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"},
{file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"},
{file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"},
{file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"},
{file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"},
{file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"},
{file = "numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"},
]

[package.dependencies]
llvmlite = "==0.43.*"
numpy = ">=1.22,<2.1"

[[package]]
name = "numpy"
version = "1.26.4"
@ -1770,6 +1809,7 @@ description = "Nvidia JIT LTO Library"
optional = true
python-versions = ">=3"
files = [
{file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"},
{file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"},
{file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"},
]
@ -1952,36 +1992,83 @@ opentelemetry-api = "1.25.0"

[[package]]
name = "outlines"
version = "0.0.34"
version = "0.1.3"
description = "Probabilistic Generative Model Programming"
optional = true
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"},
{file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"},
{file = "outlines-0.1.3-py3-none-any.whl", hash = "sha256:afcf6012b7cabbaae4a58975d03190c0bbc3d402b0b2a37538e05f335d73a247"},
{file = "outlines-0.1.3.tar.gz", hash = "sha256:5a48ad00d3bdd8eccaa7574821eb5aaa27ab9f61fde9c3fba52f352dc00197e4"},
]

[package.dependencies]
airportsdata = "*"
cloudpickle = "*"
datasets = "*"
diskcache = "*"
interegular = "*"
jinja2 = "*"
joblib = "*"
jsonschema = "*"
lark = "*"
nest-asyncio = "*"
numba = "*"
numpy = "*"
numpy = "<2.0.0"
outlines-core = "0.1.14"
pycountry = "*"
pydantic = ">=2.0"
referencing = "*"
requests = "*"
scipy = "*"
torch = ">=2.1.0"
transformers = "*"
torch = "*"
tqdm = "*"
typing-extensions = "*"

[package.extras]
serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"]
test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"]
serve = ["fastapi", "pydantic (>=2.0)", "uvicorn", "vllm (>=0.3.0)"]
test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "diff-cover", "exllamav2", "huggingface-hub", "llama-cpp-python", "mlx-lm", "openai (>=1.0.0)", "pillow", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers", "vllm"]

[[package]]
name = "outlines-core"
version = "0.1.14"
description = "Structured Text Generation in Rust"
optional = true
python-versions = ">=3.8"
files = [
{file = "outlines_core-0.1.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:291c6d9d348cb5562cd28ce44d80822d77238f1cd7c30d890b5b20488e71608d"},
{file = "outlines_core-0.1.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3a50e2f6745e0c34cc857d1bd5590e2966ad06e8ce10802976e9e6c116c7533d"},
{file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7dfe64b590a6a88dcc5e59f0a399fff0458cdcf97d68de07f08e1bd3bf8ac1d"},
{file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:100de068ce52893bec316481e65db8f1c734a0f25f540c29dafd7a8afec0a29d"},
{file = "outlines_core-0.1.14-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e06cb724770fd0fe1c8444382c4a6e79901bba33720f70fe6c8437f58eceb92e"},
{file = "outlines_core-0.1.14-cp310-cp310-win32.whl", hash = "sha256:6d41da3d8a087fd54133cf910c2d5759da55490bbd0e3bc6c1e7907b54248415"},
{file = "outlines_core-0.1.14-cp310-cp310-win_amd64.whl", hash = "sha256:646fd1073feed393bc77f9605a2fa27a54551ab04f85867ce789af1dee6326fa"},
{file = "outlines_core-0.1.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:60f3a947fe09106f7668cf832c28b9269b8f0fc109f081608acfce9262213359"},
{file = "outlines_core-0.1.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e273a100c922f794d8e077a8161d0985d3005887066b4af3ae7afd3742fe9b8"},
{file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:622e547f11a869fc67be40abc4cbcda89ae6f46f9eb46a1ec0666bd6807e0c67"},
{file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:60c9933a9faaa51b39aea3518f1822b0d3ec2c9a13b16849caca3955e29e320d"},
{file = "outlines_core-0.1.14-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a8c616ce103ef9102dbf4326f67b03e1e0f46aa19351e57f4beb37588c00428"},
{file = "outlines_core-0.1.14-cp311-cp311-win32.whl", hash = "sha256:1c77aaa4556cbb6e93cc42be0a6e262f175e0754b7694d702d642ff03df67f2c"},
{file = "outlines_core-0.1.14-cp311-cp311-win_amd64.whl", hash = "sha256:eb6ffe410866f65dbe17e95b0aabd70d990f058a2dc4e8b74f9583b07248cd36"},
{file = "outlines_core-0.1.14-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b0e408b033618f23e9bb928a47b33b1bd4c9d04a3dbec680a20977de3b4f590d"},
{file = "outlines_core-0.1.14-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:21d1393a6da5d3320e8c8247e9deeb851c5c862fd6ea5c779bd29797e8987155"},
{file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5829c568db76673d36caaf0f86e96748b491b4a209deb9be87617372394a5fb9"},
{file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e855ec99bce1099c0755bcbfa44568adf7ae0083905ba04f58a17614ddf0fe7"},
{file = "outlines_core-0.1.14-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b897cfbf9c2719aa011d9b439b4c6751d9c7df5683b2169617972d4b4a914403"},
{file = "outlines_core-0.1.14-cp38-cp38-win32.whl", hash = "sha256:4c9d908004b31bcd432156d60f4895bf5e1b51ca8c8eed82b12f1bb57d5bf7fd"},
{file = "outlines_core-0.1.14-cp38-cp38-win_amd64.whl", hash = "sha256:6668a930d928216d0b319ad84947903f1e27556f604a9743051f795b11008b64"},
{file = "outlines_core-0.1.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b745aa469cf3fb347b79a257804d75d1324e01691158664c1e413a816ce6b98d"},
{file = "outlines_core-0.1.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:27504c8360467429d6223ebc49180d6956d7418bfc3d324f6ad10f069e1813ad"},
{file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd8f1e1d91a206a520d1c577ce00136de2beb1d200ef93759fd4c9f45abe24d3"},
{file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f30c8acb42895b624c504b85678331c5f9376fa4b8069ce06a27cf80f5881e27"},
{file = "outlines_core-0.1.14-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0e6cd0e7d995a7b04d90139a695279ab4a9eb7f492618b2c037a85eaf5f9fc59"},
{file = "outlines_core-0.1.14-cp39-cp39-win32.whl", hash = "sha256:3104af4084da0e7c3d4b8538b43c725581d66bb68d426bc389680f06c3667476"},
{file = "outlines_core-0.1.14-cp39-cp39-win_amd64.whl", hash = "sha256:45c6b9baded0337c4dcfa156af05ec4efd2b25c4d976e77be28146e4037b991f"},
{file = "outlines_core-0.1.14.tar.gz", hash = "sha256:6db033e4f8e48381164e36cc716746640ad5022f0d86e4c88af15c75886b93a4"},
]

[package.dependencies]
interegular = "*"
jsonschema = "*"

[package.extras]
test = ["accelerate", "asv", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "pillow", "pre-commit", "pydantic", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "setuptools-rust", "torch", "transformers"]

[[package]]
name = "packaging"
@ -2454,6 +2541,17 @@ numpy = ">=1.16.6"
[package.extras]
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]

[[package]]
name = "pycountry"
version = "24.6.1"
description = "ISO country, subdivision, language, currency and script definitions and their translations"
optional = true
python-versions = ">=3.8"
files = [
{file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"},
{file = "pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"},
]

[[package]]
name = "pydantic"
version = "2.9.2"
@ -3465,13 +3563,13 @@ telegram = ["requests"]

[[package]]
name = "transformers"
version = "4.45.2"
version = "4.46.0"
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
optional = false
python-versions = ">=3.8.0"
files = [
{file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"},
{file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"},
{file = "transformers-4.46.0-py3-none-any.whl", hash = "sha256:e161268ae8bee315eb9e9b4c0b27f1bd6980f91e0fc292d75249193d339704c0"},
{file = "transformers-4.46.0.tar.gz", hash = "sha256:3a9e2eb537094db11c3652334d281afa4766c0e5091c4dcdb454e9921bb0d2b7"},
]

[package.dependencies]
@ -3489,13 +3587,13 @@ tqdm = ">=4.27"
[package.extras]
accelerate = ["accelerate (>=0.26.0)"]
agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"]
all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"]
all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"]
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
benchmark = ["optimum-benchmark (>=0.3.0)"]
codecarbon = ["codecarbon (==1.2.0)"]
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.20,<0.21)", "urllib3 (<2.0.0)"]
dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@ -3529,7 +3627,7 @@ torch = ["accelerate (>=0.26.0)", "torch"]
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.20,<0.21)", "torch", "tqdm (>=4.27)"]
video = ["av (==9.2.0)", "decord (==0.6.0)"]
video = ["av (==9.2.0)"]
vision = ["Pillow (>=10.0.1,<=15.0)"]

[[package]]
@ -3955,7 +4053,9 @@ type = ["pytest-mypy"]

[extras]
accelerate = ["accelerate"]
attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"]
moe = ["moe-kernels", "moe-kernels", "moe-kernels", "moe-kernels"]
outlines = ["outlines"]
@ -3966,4 +4066,4 @@ torch = ["torch"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "500fa44255e4a6c89a16314a931548447afe1ba71ea341a73cad6670e46ddac7"
content-hash = "7082f1983403ff58a1f0304e8bbf1197715b5156ddeea0f3e8287334d52c2617"

@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
protobuf = "^4.25.3"
protobuf = ">=4.25.3,<6"
grpcio = "^1.51.1"
grpcio-status = "^1.51.1"
grpcio-reflection = "^1.51.1"
@ -34,30 +34,39 @@ peft = { version = "^0.10", optional = true }
torch = { version = "^2.4.0", optional = true }
scipy = "^1.11.1"
pillow = "^10.0.0"
outlines= { version = "^0.0.34", optional = true }
prometheus-client = "^0.20.0"
outlines= { version = "0.1.3", optional = true }
prometheus-client = ">=0.20.0,<0.22"
py-cpuinfo = "^9.0.0"
compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines.
numpy = "^1.26"

attention-kernels = [
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
marlin-kernels = [
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.5/marlin_kernels-0.3.5+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
moe-kernels = [
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
rich = "^13.7.1"

[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
attention = ["attention-kernels"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]

@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"