mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 14:52:20 +00:00
Merge branch 'main' into feat/add-load-test
This commit is contained in:
commit
28b8a4287d
0
.devcontainer/devcontainer.json
Normal file
0
.devcontainer/devcontainer.json
Normal file
@ -2,3 +2,5 @@ aml
|
||||
target
|
||||
server/transformers
|
||||
server/flash-attention
|
||||
cmake-build-debug/
|
||||
cmake-build-release/
|
||||
|
4
.github/workflows/autodocs.yaml
vendored
4
.github/workflows/autodocs.yaml
vendored
@ -28,7 +28,7 @@ jobs:
|
||||
|
||||
- name: Install router
|
||||
id: install-router
|
||||
run: cargo install --path router/
|
||||
run: cargo install --path backends/v3/
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
@ -41,5 +41,5 @@ jobs:
|
||||
|
||||
- name: Check that documentation is up-to-date
|
||||
run: |
|
||||
npm install -g swagger-cli
|
||||
npm install -g @redocly/cli
|
||||
python update_doc.py --check
|
||||
|
26
.github/workflows/build.yaml
vendored
26
.github/workflows/build.yaml
vendored
@ -27,8 +27,8 @@ jobs:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
# TODO see with @Glegendre to get CPU runner here instead
|
||||
runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
|
||||
runs-on:
|
||||
group: aws-highmemory-32-plus-priv
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
@ -49,7 +49,7 @@ jobs:
|
||||
export dockerfile="Dockerfile"
|
||||
export label_extension=""
|
||||
export docker_devices=""
|
||||
export runs_on="nvidia-gpu"
|
||||
export runs_on="aws-g6-12xlarge-plus-priv"
|
||||
;;
|
||||
rocm)
|
||||
export dockerfile="Dockerfile_amd"
|
||||
@ -75,13 +75,18 @@ jobs:
|
||||
echo "LABEL=${label_extension}" >> $GITHUB_ENV
|
||||
echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
|
||||
echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
|
||||
echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
|
||||
- name: Initialize Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
install: true
|
||||
config-inline: |
|
||||
[registry."docker.io"]
|
||||
mirrors = ["registry.github-runners.huggingface.tech"]
|
||||
buildkitd-config: /tmp/buildkitd.toml
|
||||
- name: Login to internal Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.REGISTRY_USERNAME }}
|
||||
password: ${{ secrets.REGISTRY_PASSWORD }}
|
||||
registry: registry.internal.huggingface.tech
|
||||
- name: Login to GitHub Container Registry
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
@ -103,7 +108,7 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
|
||||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
|
||||
tags: |
|
||||
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
|
||||
# If main, release or tag
|
||||
@ -115,7 +120,7 @@ jobs:
|
||||
flavor: |
|
||||
latest=auto
|
||||
images: |
|
||||
registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
|
||||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
|
||||
ghcr.io/huggingface/text-generation-inference
|
||||
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
|
||||
tags: |
|
||||
@ -141,7 +146,7 @@ jobs:
|
||||
- name: Final
|
||||
id: final
|
||||
run: |
|
||||
echo "docker_image=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
||||
echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
||||
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
|
||||
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
|
||||
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
||||
@ -150,7 +155,8 @@ jobs:
|
||||
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
needs: build-and-push
|
||||
runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
|
||||
runs-on:
|
||||
group: ${{ needs.build-and-push.outputs.runs_on }}
|
||||
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
|
||||
env:
|
||||
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
|
||||
|
1
.github/workflows/ci_build.yaml
vendored
1
.github/workflows/ci_build.yaml
vendored
@ -10,6 +10,7 @@ on:
|
||||
paths:
|
||||
- ".github/workflows/build.yaml"
|
||||
- "integration-tests/**"
|
||||
- "backends/**"
|
||||
- "server/**"
|
||||
- "proto/**"
|
||||
- "router/**"
|
||||
|
3
.github/workflows/load_test.yaml
vendored
3
.github/workflows/load_test.yaml
vendored
@ -23,7 +23,8 @@ jobs:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
runs-on: [ self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci ]
|
||||
runs-on:
|
||||
group: aws-g5-12xlarge
|
||||
env:
|
||||
DOCKER_VOLUME: /cache
|
||||
steps:
|
||||
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -3,6 +3,10 @@ target
|
||||
router/tokenizer.json
|
||||
*__pycache__*
|
||||
|
||||
backends/v3/src/client/pb
|
||||
backends/client/src/v2/pb
|
||||
backends/client/src/v3/pb
|
||||
|
||||
# ROCm auto-generated files
|
||||
*.hip
|
||||
server/exllamav2_kernels/exllamav2_kernels/hip/
|
||||
|
@ -13,6 +13,11 @@ repos:
|
||||
- repo: https://github.com/doublify/pre-commit-rust
|
||||
rev: v1.0
|
||||
hooks:
|
||||
- id: fmt
|
||||
- id: cargo-check
|
||||
- id: fmt
|
||||
- id: clippy
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.3.0
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix]
|
||||
|
79
.redocly.lint-ignore.yaml
Normal file
79
.redocly.lint-ignore.yaml
Normal file
@ -0,0 +1,79 @@
|
||||
# This file instructs Redocly's linter to ignore the rules contained for specific parts of your API.
|
||||
# See https://redoc.ly/docs/cli/ for more information.
|
||||
docs/openapi.json:
|
||||
no-empty-servers:
|
||||
- '#/openapi'
|
||||
spec:
|
||||
- >-
|
||||
#/components/schemas/GenerateParameters/properties/best_of/exclusiveMinimum
|
||||
- >-
|
||||
#/components/schemas/GenerateParameters/properties/frequency_penalty/exclusiveMinimum
|
||||
- '#/components/schemas/GenerateParameters/properties/grammar/nullable'
|
||||
- >-
|
||||
#/components/schemas/GenerateParameters/properties/repetition_penalty/exclusiveMinimum
|
||||
- '#/components/schemas/GenerateParameters/properties/seed/exclusiveMinimum'
|
||||
- >-
|
||||
#/components/schemas/GenerateParameters/properties/temperature/exclusiveMinimum
|
||||
- '#/components/schemas/GenerateParameters/properties/top_k/exclusiveMinimum'
|
||||
- >-
|
||||
#/components/schemas/GenerateParameters/properties/top_n_tokens/exclusiveMinimum
|
||||
- '#/components/schemas/GenerateParameters/properties/top_p/exclusiveMinimum'
|
||||
- >-
|
||||
#/components/schemas/GenerateParameters/properties/typical_p/exclusiveMinimum
|
||||
- '#/components/schemas/GenerateResponse/properties/details/nullable'
|
||||
- '#/components/schemas/StreamResponse/properties/details/nullable'
|
||||
- '#/components/schemas/ChatRequest/properties/response_format/nullable'
|
||||
- '#/components/schemas/ChatRequest/properties/tool_choice/nullable'
|
||||
- '#/components/schemas/ToolChoice/nullable'
|
||||
- '#/components/schemas/ChatCompletionComplete/properties/logprobs/nullable'
|
||||
- '#/components/schemas/ChatCompletionChoice/properties/logprobs/nullable'
|
||||
no-invalid-media-type-examples:
|
||||
- '#/paths/~1/post/responses/422/content/application~1json/example'
|
||||
- '#/paths/~1/post/responses/424/content/application~1json/example'
|
||||
- '#/paths/~1/post/responses/429/content/application~1json/example'
|
||||
- '#/paths/~1/post/responses/500/content/application~1json/example'
|
||||
- '#/paths/~1generate/post/responses/422/content/application~1json/example'
|
||||
- '#/paths/~1generate/post/responses/424/content/application~1json/example'
|
||||
- '#/paths/~1generate/post/responses/429/content/application~1json/example'
|
||||
- '#/paths/~1generate/post/responses/500/content/application~1json/example'
|
||||
- >-
|
||||
#/paths/~1generate_stream/post/responses/422/content/text~1event-stream/example
|
||||
- >-
|
||||
#/paths/~1generate_stream/post/responses/424/content/text~1event-stream/example
|
||||
- >-
|
||||
#/paths/~1generate_stream/post/responses/429/content/text~1event-stream/example
|
||||
- >-
|
||||
#/paths/~1generate_stream/post/responses/500/content/text~1event-stream/example
|
||||
- '#/paths/~1tokenize/post/responses/404/content/application~1json/example'
|
||||
- >-
|
||||
#/paths/~1v1~1chat~1completions/post/responses/422/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1chat~1completions/post/responses/424/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1chat~1completions/post/responses/429/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1chat~1completions/post/responses/500/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1completions/post/responses/422/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1completions/post/responses/424/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1completions/post/responses/429/content/application~1json/example
|
||||
- >-
|
||||
#/paths/~1v1~1completions/post/responses/500/content/application~1json/example
|
||||
operation-4xx-response:
|
||||
- '#/paths/~1health/get/responses'
|
||||
- '#/paths/~1info/get/responses'
|
||||
- '#/paths/~1metrics/get/responses'
|
||||
no-unused-components:
|
||||
- '#/components/schemas/Completion'
|
||||
security-defined:
|
||||
- '#/paths/~1/post'
|
||||
- '#/paths/~1generate/post'
|
||||
- '#/paths/~1generate_stream/post'
|
||||
- '#/paths/~1health/get'
|
||||
- '#/paths/~1info/get'
|
||||
- '#/paths/~1metrics/get'
|
||||
- '#/paths/~1tokenize/post'
|
||||
- '#/paths/~1v1~1chat~1completions/post'
|
||||
- '#/paths/~1v1~1completions/post'
|
789
Cargo.lock
generated
789
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
23
Cargo.toml
23
Cargo.toml
@ -1,15 +1,24 @@
|
||||
[workspace]
|
||||
members = [
|
||||
"benchmark",
|
||||
"router",
|
||||
"router/client",
|
||||
"router/grpc-metadata",
|
||||
"launcher"
|
||||
"benchmark",
|
||||
"backends/v3",
|
||||
"backends/grpc-metadata",
|
||||
"backends/trtllm",
|
||||
"backends/client",
|
||||
"launcher"
|
||||
]
|
||||
default-members = [
|
||||
"benchmark",
|
||||
"backends/v3",
|
||||
"backends/grpc-metadata",
|
||||
# "backends/trtllm",
|
||||
"backends/client",
|
||||
"launcher"
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.package]
|
||||
version = "2.1.2-dev0"
|
||||
version = "2.2.1-dev0"
|
||||
edition = "2021"
|
||||
authors = ["Olivier Dehaene"]
|
||||
homepage = "https://github.com/huggingface/text-generation-inference"
|
||||
@ -18,6 +27,8 @@ homepage = "https://github.com/huggingface/text-generation-inference"
|
||||
base64 = "0.22.0"
|
||||
tokenizers = { version = "0.19.1", features = ["http"] }
|
||||
hf-hub = { version = "0.3.1", features = ["tokio"] }
|
||||
metrics = { version = "0.23.0" }
|
||||
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
|
||||
|
||||
[profile.release]
|
||||
incremental = true
|
||||
|
31
Dockerfile
31
Dockerfile
@ -11,6 +11,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY backends backends
|
||||
COPY launcher launcher
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
@ -33,6 +34,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY backends backends
|
||||
COPY launcher launcher
|
||||
RUN cargo build --profile release-opt
|
||||
|
||||
@ -41,7 +43,7 @@ RUN cargo build --profile release-opt
|
||||
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
|
||||
|
||||
# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
|
||||
ARG PYTORCH_VERSION=2.3.0
|
||||
ARG PYTORCH_VERSION=2.4.0
|
||||
|
||||
ARG PYTHON_VERSION=3.10
|
||||
# Keep in sync with `server/pyproject.toml
|
||||
@ -140,13 +142,6 @@ COPY server/Makefile-eetq Makefile
|
||||
# Build specific version of transformers
|
||||
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
|
||||
|
||||
# Build marlin kernels
|
||||
FROM kernel-builder AS marlin-kernels-builder
|
||||
WORKDIR /usr/src
|
||||
COPY server/marlin/ .
|
||||
# Build specific version of transformers
|
||||
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
|
||||
|
||||
# Build Lorax Punica kernels
|
||||
FROM kernel-builder AS lorax-punica-builder
|
||||
WORKDIR /usr/src
|
||||
@ -161,6 +156,15 @@ COPY server/custom_kernels/ .
|
||||
# Build specific version of transformers
|
||||
RUN python setup.py build
|
||||
|
||||
# Build FBGEMM CUDA kernels
|
||||
FROM kernel-builder AS fbgemm-builder
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
COPY server/Makefile-fbgemm Makefile
|
||||
|
||||
RUN make build-fbgemm
|
||||
|
||||
# Build vllm CUDA kernels
|
||||
FROM kernel-builder AS vllm-builder
|
||||
|
||||
@ -222,13 +226,10 @@ COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-31
|
||||
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from eetq kernels builder
|
||||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from marlin kernels builder
|
||||
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Copy builds artifacts from vllm builder
|
||||
# Copy build artifacts from fbgemm builder
|
||||
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from vllm builder
|
||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Copy build artifacts from mamba builder
|
||||
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
|
||||
@ -243,7 +244,7 @@ COPY server/Makefile server/Makefile
|
||||
RUN cd server && \
|
||||
make gen-server && \
|
||||
pip install -r requirements_cuda.txt && \
|
||||
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
|
||||
pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
|
||||
pip install nvidia-nccl-cu12==2.22.3
|
||||
|
||||
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
|
||||
|
23
Dockerfile.trtllm
Normal file
23
Dockerfile.trtllm
Normal file
@ -0,0 +1,23 @@
|
||||
# All the tooling for CUDA
|
||||
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS cuda-builder
|
||||
|
||||
WORKDIR /usr/src/tgi/backends/trtllm
|
||||
RUN apt update && apt install -y cmake git git-lfs gcc g++ ninja-build libopenmpi-dev python3-dev python3-pip wget
|
||||
|
||||
COPY . /usr/src/tgi
|
||||
RUN chmod +x scripts/install_tensorrt.sh && scripts/install_tensorrt.sh
|
||||
RUN cmake -G Ninja -B build -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include .
|
||||
RUN cmake --build build --parallel -t tgi_trtllm_backend_impl
|
||||
|
||||
# All the tooling for Rust
|
||||
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
|
||||
WORKDIR /usr/src
|
||||
|
||||
# Include CUDA related libraries and tools to the Rust based image
|
||||
COPY --from=cuda-builder /usr/local/cuda /usr/local/cuda
|
||||
COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt
|
||||
COPY --from=cuda-builder /usr/src/tgi/backends/trtllm/build /usr/local/tgi/trtllm/build
|
||||
ENV PATH=/usr/local/cuda/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH
|
||||
|
||||
RUN apt update && apt install -y cmake git gcc g++ ninja-build libopenmpi3
|
@ -11,6 +11,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY backends backends
|
||||
COPY launcher launcher
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
@ -33,6 +34,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY backends backends
|
||||
COPY launcher launcher
|
||||
RUN cargo build --profile release-opt
|
||||
|
||||
|
@ -12,6 +12,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY backends backends
|
||||
COPY launcher launcher
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
@ -34,6 +35,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY backends backends
|
||||
COPY launcher launcher
|
||||
RUN cargo build --profile release-opt
|
||||
|
||||
|
6
Makefile
6
Makefile
@ -5,13 +5,13 @@ install-server-cpu:
|
||||
cd server && make install-server
|
||||
|
||||
install-router:
|
||||
cd router && cargo install --path .
|
||||
cargo install --path backends/v3/
|
||||
|
||||
install-launcher:
|
||||
cd launcher && cargo install --path .
|
||||
cargo install --path launcher/
|
||||
|
||||
install-benchmark:
|
||||
cd benchmark && cargo install --path .
|
||||
cargo install --path benchmark/
|
||||
|
||||
install: install-server install-router install-launcher
|
||||
|
||||
|
@ -80,7 +80,7 @@ model=HuggingFaceH4/zephyr-7b-beta
|
||||
volume=$PWD/data
|
||||
|
||||
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
|
||||
ghcr.io/huggingface/text-generation-inference:2.1.1 --model-id $model
|
||||
ghcr.io/huggingface/text-generation-inference:2.2.0 --model-id $model
|
||||
```
|
||||
|
||||
And then you can make requests like
|
||||
@ -94,7 +94,7 @@ curl 127.0.0.1:8080/generate_stream \
|
||||
|
||||
**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
|
||||
|
||||
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.1.1-rocm --model-id $model` instead of the command above.
|
||||
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.2.0-rocm --model-id $model` instead of the command above.
|
||||
|
||||
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
|
||||
```
|
||||
|
63
backends/trtllm/CMakeLists.txt
Normal file
63
backends/trtllm/CMakeLists.txt
Normal file
@ -0,0 +1,63 @@
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
|
||||
project(tgi-trtllm-backend VERSION 1.0.0)
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
|
||||
include(FetchContent)
|
||||
include(ExternalProject)
|
||||
|
||||
option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
|
||||
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
|
||||
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
|
||||
|
||||
# We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
|
||||
find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
|
||||
|
||||
#### External dependencies ####
|
||||
include(cmake/fmt.cmake)
|
||||
include(cmake/json.cmake)
|
||||
include(cmake/spdlog.cmake)
|
||||
include(cmake/trtllm.cmake)
|
||||
|
||||
# Let's build TRTLLM as part of CMake
|
||||
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
|
||||
|
||||
# Tell CMake to need try to override the RPATH for executorWorker as it has not information on how to do so
|
||||
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
|
||||
|
||||
# TGI TRTLLM Backend definition
|
||||
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
|
||||
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
|
||||
target_include_directories(tgi_trtllm_backend_impl PRIVATE
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
$<INSTALL_INTERFACE:include>
|
||||
)
|
||||
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
|
||||
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
|
||||
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
|
||||
|
||||
# This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
|
||||
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
|
||||
install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
|
||||
|
||||
#### Unit Tests ####
|
||||
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
|
||||
message(STATUS "Building tests")
|
||||
FetchContent_Declare(
|
||||
Catch2
|
||||
GIT_REPOSITORY https://github.com/catchorg/Catch2
|
||||
GIT_TAG v3.6.0
|
||||
)
|
||||
FetchContent_MakeAvailable(Catch2)
|
||||
|
||||
# add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
|
||||
# target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
|
||||
include(CTest)
|
||||
include(Catch)
|
||||
# catch_discover_tests(tgi_trtllm_backend_tests)
|
||||
endif ()
|
26
backends/trtllm/Cargo.toml
Normal file
26
backends/trtllm/Cargo.toml
Normal file
@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "text-generation-backends-trtllm"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
authors.workspace = true
|
||||
homepage.workspace = true
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
async-stream = "0.3"
|
||||
cxx = "1.0"
|
||||
text-generation-router = { path = "../../router" }
|
||||
tokenizers = { version = "0.19", features = ["hf-hub"] }
|
||||
tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
|
||||
tokio-stream = "0.1.15"
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
thiserror = "1.0.62"
|
||||
tracing = "0.1"
|
||||
tracing-opentelemetry = "0.24"
|
||||
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
|
||||
log = { version = "0.4", features = [] }
|
||||
|
||||
[build-dependencies]
|
||||
cmake = "0.1"
|
||||
cxx-build = { version = "1.0", features = ["parallel"] }
|
||||
pkg-config = "0.3"
|
100
backends/trtllm/Dockerfile
Normal file
100
backends/trtllm/Dockerfile
Normal file
@ -0,0 +1,100 @@
|
||||
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
|
||||
ARG OMPI_VERSION="4.1.6"
|
||||
|
||||
# Build dependencies resolver stage
|
||||
FROM lukemathwalker/cargo-chef:latest AS chef
|
||||
WORKDIR /usr/src/text-generation-inference
|
||||
|
||||
FROM chef AS planner
|
||||
COPY . .
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
# CUDA dependent dependencies resolver stage
|
||||
FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt update && apt install -y \
|
||||
build-essential \
|
||||
cmake \
|
||||
curl \
|
||||
gcc \
|
||||
g++ \
|
||||
git \
|
||||
git-lfs \
|
||||
libssl-dev \
|
||||
ninja-build \
|
||||
pkg-config \
|
||||
python3 \
|
||||
python3-setuptools \
|
||||
tar \
|
||||
wget
|
||||
|
||||
ENV TGI_INSTALL_PREFIX=/usr/local/tgi
|
||||
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
|
||||
|
||||
# Install OpenMPI
|
||||
FROM cuda-builder AS mpi-builder
|
||||
ARG OMPI_VERSION
|
||||
|
||||
ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2"
|
||||
RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \
|
||||
mkdir /usr/src/mpi && \
|
||||
tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
|
||||
cd /usr/src/mpi && \
|
||||
./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --without-slurm && \
|
||||
make -j all && \
|
||||
make install && \
|
||||
rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"
|
||||
|
||||
# Install TensorRT
|
||||
FROM cuda-builder AS trt-builder
|
||||
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
|
||||
RUN chmod +x /opt/install_tensorrt.sh && \
|
||||
/opt/install_tensorrt.sh
|
||||
|
||||
# Build Backend
|
||||
FROM cuda-builder AS tgi-builder
|
||||
WORKDIR /usr/src/text-generation-inference
|
||||
|
||||
# Install Rust
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
|
||||
chmod -R a+w /root/.rustup && \
|
||||
chmod -R a+w /root/.cargo
|
||||
|
||||
ENV PATH="/root/.cargo/bin:$PATH"
|
||||
RUN cargo install cargo-chef
|
||||
|
||||
# Cache dependencies
|
||||
COPY --from=planner /usr/src/text-generation-inference/recipe.json .
|
||||
RUN cargo chef cook --release --recipe-path recipe.json
|
||||
|
||||
# Build actual TGI
|
||||
ARG CUDA_ARCH_LIST
|
||||
ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH"
|
||||
ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
|
||||
ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH"
|
||||
|
||||
COPY . .
|
||||
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
|
||||
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
|
||||
RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
|
||||
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release --bin text-generation-backends-trtllm
|
||||
|
||||
FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
|
||||
WORKDIR /usr/local/tgi/bin
|
||||
|
||||
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
|
||||
|
||||
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
|
||||
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
|
||||
COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
|
||||
COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
|
||||
|
||||
FROM runtime
|
||||
|
||||
LABEL co.huggingface.vendor="Hugging Face Inc."
|
||||
LABEL org.opencontainers.image.authors="hardware@hf.co"
|
||||
|
||||
ENTRYPOINT ["./text-generation-launcher"]
|
||||
CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
|
46
backends/trtllm/README.md
Normal file
46
backends/trtllm/README.md
Normal file
@ -0,0 +1,46 @@
|
||||
# Text Generation Inference - TensorRT-LLM Backend Implementation
|
||||
|
||||
## Description
|
||||
|
||||
This folder provides the sources of the TensorRT-LLM backend implementation powered by TensorRT-LLM Executor new API
|
||||
|
||||
## Simplified Request Sequence
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
actor User
|
||||
participant TextGenerationInference.HttpServer
|
||||
participant TextGenerationInference.TensorRtLlmBackend
|
||||
participant TextGenerationInference.TensorRtLlmWorkerThread
|
||||
participant TensorRtLlm.Executor
|
||||
participant Nvidia.Gpu
|
||||
User ->> TextGenerationInference.HttpServer: POST /generate
|
||||
TextGenerationInference.HttpServer ->> TextGenerationInference.TensorRtLlmBackend: Validate and forward inputs & parameters
|
||||
TextGenerationInference.TensorRtLlmBackend ->> TextGenerationInference.TensorRtLlmWorkerThread: Allocate a new context and spawn a new thread to handle the request
|
||||
TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Submit the request to the In-Flight Batcher
|
||||
activate Nvidia.Gpu
|
||||
TensorRtLlm.Executor ->> Nvidia.Gpu: Add the request to the poll for execution
|
||||
TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Response with an unique request identifier
|
||||
rect rgb(10, 92, 54)
|
||||
loop every 100us
|
||||
rect rgb(15, 81, 50)
|
||||
alt Acquire lock to query executor
|
||||
TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Poll request number of new token(s) generated
|
||||
else There are new generated tokens
|
||||
TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Retrieve newly generated tokens
|
||||
TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Return decoded token information and potential error (omitted)
|
||||
rect rgb(11, 110, 79)
|
||||
alt Generated token is final
|
||||
TensorRtLlm.Executor ->> Nvidia.Gpu: Remove request from the scheduler and from the GPU
|
||||
TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream the remaining decoded tokens and flush the connection
|
||||
else Generated token is not final
|
||||
TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream token back to the user as they get decoded
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
deactivate Nvidia.Gpu
|
||||
end
|
||||
end
|
||||
|
||||
```
|
150
backends/trtllm/build.rs
Normal file
150
backends/trtllm/build.rs
Normal file
@ -0,0 +1,150 @@
|
||||
use cxx_build::CFG;
|
||||
use pkg_config;
|
||||
use std::env;
|
||||
use std::env::consts::ARCH;
|
||||
use std::path::{absolute, PathBuf};
|
||||
|
||||
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
|
||||
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
|
||||
const CUDA_REQUIRED_VERSION: &str = "12.5";
|
||||
const MPI_REQUIRED_VERSION: &str = "4.1";
|
||||
const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
|
||||
const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
|
||||
const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
|
||||
|
||||
// Dependencies
|
||||
const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"];
|
||||
const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
|
||||
const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
|
||||
("dylib", "tensorrt_llm"),
|
||||
("static", "tensorrt_llm_executor_static"),
|
||||
("dylib", "tensorrt_llm_nvrtc_wrapper"),
|
||||
("dylib", "nvinfer_plugin_tensorrt_llm"),
|
||||
("dylib", "decoder_attention"),
|
||||
];
|
||||
|
||||
macro_rules! probe {
|
||||
($name: expr, $version: expr) => {
|
||||
if let Err(_) = pkg_config::probe_library($name) {
|
||||
pkg_config::probe_library(&format!("{}-{}", $name, $version))
|
||||
.expect(&format!("Failed to locate {}", $name));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
|
||||
// Build the backend implementation through CMake
|
||||
let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
|
||||
let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
|
||||
let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default
|
||||
|
||||
let mut install_path = PathBuf::from(install_path);
|
||||
if !install_path.is_absolute() {
|
||||
install_path = absolute(out_dir).expect("cannot happen").join(install_path);
|
||||
}
|
||||
|
||||
let _ = cmake::Config::new(".")
|
||||
.uses_cxx11()
|
||||
.generator("Ninja")
|
||||
.profile(match is_debug {
|
||||
true => "Debug",
|
||||
false => "Release",
|
||||
})
|
||||
.env("OPT_LEVEL", opt_level)
|
||||
.define("CMAKE_INSTALL_PREFIX", &install_path)
|
||||
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
|
||||
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
|
||||
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
|
||||
.build();
|
||||
|
||||
// Additional transitive CMake dependencies
|
||||
let deps_folder = out_dir.join("build").join("_deps");
|
||||
for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES {
|
||||
let dep_name = match is_debug {
|
||||
true => format!("{}d", dependency),
|
||||
false => String::from(dependency),
|
||||
};
|
||||
let dep_path = deps_folder.join(format!("{}-build", dependency));
|
||||
println!("cargo:rustc-link-search={}", dep_path.display());
|
||||
println!("cargo:rustc-link-lib=static={}", dep_name);
|
||||
}
|
||||
|
||||
// Emit linkage information from the artifacts we just built
|
||||
let install_lib_path = install_path.join("lib");
|
||||
|
||||
println!(
|
||||
r"cargo:warning=Adding link search path: {}",
|
||||
install_lib_path.display()
|
||||
);
|
||||
println!(r"cargo:rustc-link-search={}", install_lib_path.display());
|
||||
|
||||
(PathBuf::from(install_path), deps_folder)
|
||||
}
|
||||
|
||||
fn build_ffi_layer(deps_folder: &PathBuf) {
|
||||
CFG.include_prefix = "backends/trtllm";
|
||||
cxx_build::bridge("src/lib.rs")
|
||||
.static_flag(true)
|
||||
.include(deps_folder.join("fmt-src").join("include"))
|
||||
.include(deps_folder.join("spdlog-src").join("include"))
|
||||
.include(deps_folder.join("json-src").join("include"))
|
||||
.include(deps_folder.join("trtllm-src").join("cpp").join("include"))
|
||||
.include("/usr/local/cuda/include")
|
||||
.include("/usr/local/tensorrt/include")
|
||||
.file("src/ffi.cpp")
|
||||
.std("c++20")
|
||||
.compile("tgi_trtllm_backend");
|
||||
|
||||
println!("cargo:rerun-if-changed=CMakeLists.txt");
|
||||
println!("cargo:rerun-if-changed=include/backend.h");
|
||||
println!("cargo:rerun-if-changed=lib/backend.cpp");
|
||||
println!("cargo:rerun-if-changed=include/ffi.h");
|
||||
println!("cargo:rerun-if-changed=src/ffi.cpp");
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Misc variables
|
||||
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||
let build_profile = env::var("PROFILE").unwrap();
|
||||
let (is_debug, opt_level) = match build_profile.as_ref() {
|
||||
"debug" => (true, "0"),
|
||||
_ => (false, "3"),
|
||||
};
|
||||
|
||||
// Build the backend
|
||||
let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);
|
||||
|
||||
// Build the FFI layer calling the backend above
|
||||
build_ffi_layer(&deps_folder);
|
||||
|
||||
// Emit linkage search path
|
||||
probe!("ompi", MPI_REQUIRED_VERSION);
|
||||
|
||||
// Probe CUDA & co. with pkg-config
|
||||
CUDA_TRANSITIVE_DEPS.iter().for_each(|name| {
|
||||
probe!(name, CUDA_REQUIRED_VERSION);
|
||||
});
|
||||
|
||||
// NCCL is slightly trickier because it might not have a pkgconfig installed
|
||||
let nccl_library_path_default = format!("/usr/local/{}-linux-gnu", ARCH);
|
||||
let nccl_library_path = NCCL_ROOT_DIR.unwrap_or(&nccl_library_path_default);
|
||||
println!(r"cargo:rustc-link-search=native={}", nccl_library_path);
|
||||
println!("cargo:rustc-link-lib=dylib=nccl");
|
||||
|
||||
// TensorRT
|
||||
let tensort_library_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt/lib");
|
||||
println!(r"cargo:rustc-link-search=native={}", tensort_library_path);
|
||||
println!("cargo:rustc-link-lib=dylib=nvinfer");
|
||||
|
||||
// TensorRT-LLM
|
||||
TENSORRT_LLM_TRANSITIVE_DEPS
|
||||
.iter()
|
||||
.for_each(|(link_type, name)| {
|
||||
println!("cargo:rustc-link-lib={}={}", link_type, name);
|
||||
});
|
||||
|
||||
// Backend
|
||||
BACKEND_DEPS.iter().for_each(|name| {
|
||||
println!("cargo:rustc-link-lib=static={}", name);
|
||||
});
|
||||
}
|
6
backends/trtllm/cmake/fmt.cmake
Normal file
6
backends/trtllm/cmake/fmt.cmake
Normal file
@ -0,0 +1,6 @@
|
||||
FetchContent_Declare(
|
||||
fmt
|
||||
GIT_REPOSITORY https://github.com/fmtlib/fmt
|
||||
GIT_TAG 11.0.1
|
||||
)
|
||||
FetchContent_MakeAvailable(fmt)
|
5
backends/trtllm/cmake/json.cmake
Normal file
5
backends/trtllm/cmake/json.cmake
Normal file
@ -0,0 +1,5 @@
|
||||
fetchcontent_declare(
|
||||
json
|
||||
URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
|
||||
)
|
||||
fetchcontent_makeavailable(json)
|
17
backends/trtllm/cmake/spdlog.cmake
Normal file
17
backends/trtllm/cmake/spdlog.cmake
Normal file
@ -0,0 +1,17 @@
|
||||
set(SPDLOG_USE_FMT ON)
|
||||
set(SPDLOG_BUILD_SHARED OFF)
|
||||
set(SPDLOG_FMT_EXTERNAL ON)
|
||||
|
||||
# Define the level at which SPDLOG_ compilation level is defined
|
||||
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
|
||||
else ()
|
||||
add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
|
||||
endif ()
|
||||
|
||||
fetchcontent_declare(
|
||||
spdlog
|
||||
GIT_REPOSITORY https://github.com/gabime/spdlog.git
|
||||
GIT_TAG v1.14.1
|
||||
)
|
||||
fetchcontent_makeavailable(spdlog)
|
42
backends/trtllm/cmake/trtllm.cmake
Normal file
42
backends/trtllm/cmake/trtllm.cmake
Normal file
@ -0,0 +1,42 @@
|
||||
set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
|
||||
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
|
||||
|
||||
set(USE_CXX11_ABI ON)
|
||||
set(BUILD_PYT OFF)
|
||||
set(BUILD_PYBIND OFF)
|
||||
set(BUILD_MICRO_BENCHMARKS OFF)
|
||||
set(BUILD_BENCHMARKS OFF)
|
||||
set(BUILD_TESTS OFF)
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
|
||||
|
||||
message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
set(FAST_BUILD ON)
|
||||
set(NVTX_DISABLE OFF)
|
||||
else ()
|
||||
set(FAST_BUILD OFF)
|
||||
set(FAST_MATH ON)
|
||||
set(NVTX_DISABLE ON)
|
||||
endif ()
|
||||
|
||||
fetchcontent_declare(
|
||||
trtllm
|
||||
GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
|
||||
GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1
|
||||
GIT_SHALLOW FALSE
|
||||
)
|
||||
fetchcontent_makeavailable(trtllm)
|
||||
|
||||
message(STATUS "Found TensorRT-LLM: ${trtllm_SOURCE_DIR}")
|
||||
execute_process(COMMAND git lfs install WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
|
||||
execute_process(COMMAND git lfs pull WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
|
||||
|
||||
# TRTLLM use a JIT based *precompiled* library to generate some specific kernels, we are generating the path to this one here
|
||||
set(TRTLLM_NVRTC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_nvrtc_wrapper${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE INTERNAL "nvrtc wrapper library name")
|
||||
set(TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_NVRTC_LIBRARY_NAME}"
|
||||
CACHE INTERNAL "nvrtc wrapper library path")
|
||||
|
||||
# The same Executor Static library
|
||||
set(TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_executor_static${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE INTERNAL "executor_static library name")
|
||||
set(TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/executor/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME}" CACHE INTERNAL "executor_static library path")
|
0
backends/trtllm/cmake/utils/detect_cuda_arch.cu
Normal file
0
backends/trtllm/cmake/utils/detect_cuda_arch.cu
Normal file
121
backends/trtllm/include/backend.h
Normal file
121
backends/trtllm/include/backend.h
Normal file
@ -0,0 +1,121 @@
|
||||
//
|
||||
// Created by Morgan Funtowicz on 6/30/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_H
|
||||
#define TGI_TRTLLM_BACKEND_H
|
||||
|
||||
#include <cmath>
|
||||
#include <filesystem>
|
||||
#include <span>
|
||||
#include <vector>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <tensorrt_llm/runtime/common.h>
|
||||
#include <tensorrt_llm/executor/executor.h>
|
||||
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
|
||||
|
||||
using json = nlohmann::json;
|
||||
namespace tle = tensorrt_llm::executor;
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
using RequestId = tle::IdType;
|
||||
using TokenId = tle::TokenIdType;
|
||||
|
||||
/**
|
||||
* Initialize all the components required by TRTLLM.
|
||||
* It is required to call this function before attempting to load any engine
|
||||
*/
|
||||
void InitializeBackend();
|
||||
|
||||
/**
|
||||
*
|
||||
* @param config TensorRT-LLM configuration object
|
||||
* @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
|
||||
* @return
|
||||
*/
|
||||
tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
|
||||
|
||||
/**
|
||||
* Get the sampling configuration from the parameters provided by TGI
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetition_penalty
|
||||
* @param frequency_penalty
|
||||
* @param seed
|
||||
* @return
|
||||
*/
|
||||
tle::SamplingConfig GetSamplingConfig(
|
||||
uint32_t topK,
|
||||
float_t topP,
|
||||
float_t temperature,
|
||||
float_t repetition_penalty,
|
||||
float_t frequency_penalty,
|
||||
uint64_t seed
|
||||
);
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
class TensorRtLlmBackend {
|
||||
private:
|
||||
const json config;
|
||||
tle::Executor executor;
|
||||
|
||||
public:
|
||||
explicit TensorRtLlmBackend(
|
||||
const std::filesystem::path &engineFolder,
|
||||
const std::filesystem::path &executorWorker
|
||||
);
|
||||
|
||||
/**
|
||||
* Indicate if the backend is ready to accept incoming request
|
||||
* @return true if ready, false otherwise
|
||||
*/
|
||||
[[nodiscard]] bool IsReady() const;
|
||||
|
||||
/**
|
||||
* Query the executor for the number of token available for pulling
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard]] size_t NumResponsesReady() const;
|
||||
|
||||
/**
|
||||
* Submit a new generation task to the executor
|
||||
* @param tokens
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetition_penalty
|
||||
* @param frequency_penalty
|
||||
* @param seed
|
||||
* @return Request id related to this generation for reference
|
||||
*/
|
||||
[[nodiscard]] RequestId Submit(
|
||||
const std::vector<TokenId> &tokens,
|
||||
int32_t topK,
|
||||
float_t topP,
|
||||
float_t temperature,
|
||||
float_t repetition_penalty,
|
||||
float_t frequency_penalty,
|
||||
uint64_t seed
|
||||
);
|
||||
|
||||
/**
|
||||
*
|
||||
* @param requestId The request id to poll the generation results
|
||||
* @return
|
||||
*/
|
||||
std::vector<tle::Response> Poll(RequestId requestId);
|
||||
|
||||
/**
|
||||
* Stop the underlying executor
|
||||
*/
|
||||
void Shutdown();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_H
|
75
backends/trtllm/include/ffi.h
Normal file
75
backends/trtllm/include/ffi.h
Normal file
@ -0,0 +1,75 @@
|
||||
//
|
||||
// Created by mfuntowicz on 7/11/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_FFI_H
|
||||
#define TGI_TRTLLM_BACKEND_FFI_H
|
||||
|
||||
#include <cstddef>
|
||||
#include "backend.h"
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
class TensorRtLlmBackendImpl;
|
||||
}
|
||||
|
||||
#include "backends/trtllm/src/lib.rs.h"
|
||||
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
|
||||
// struct GenerationContext;
|
||||
|
||||
class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
|
||||
public:
|
||||
/***
|
||||
*
|
||||
* @param engineFolder
|
||||
* @param executorWorker
|
||||
*/
|
||||
TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
|
||||
|
||||
/***
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
bool IsReady() const;
|
||||
|
||||
/***
|
||||
*
|
||||
* @param tokens
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetition_penalty
|
||||
* @param frequency_penalty
|
||||
* @param seed
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
|
||||
uint64_t
|
||||
Submit(rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature,
|
||||
float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
|
||||
|
||||
/***
|
||||
*
|
||||
* @param requestId
|
||||
* @param ctx
|
||||
* @param callback
|
||||
* @return
|
||||
*/
|
||||
size_t StreamTokens(
|
||||
const RequestId requestId,
|
||||
huggingface::tgi::backends::GenerationContext *ctx,
|
||||
rust::Fn<void(huggingface::tgi::backends::GenerationContext *,
|
||||
huggingface::tgi::backends::GenerationStep)> callback);
|
||||
};
|
||||
|
||||
/***
|
||||
*
|
||||
* @param engineFolder
|
||||
* @return
|
||||
*/
|
||||
std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
|
||||
}
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_FFI_H
|
59
backends/trtllm/include/hardware.h
Normal file
59
backends/trtllm/include/hardware.h
Normal file
@ -0,0 +1,59 @@
|
||||
//
|
||||
// Created by mfuntowicz on 7/23/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
|
||||
#define TGI_TRTLLM_BACKEND_HARDWARE_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <fmt/base.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <nvml.h>
|
||||
|
||||
namespace huggingface::hardware::cuda {
|
||||
|
||||
#define AMPERE_SM_MAJOR 8
|
||||
#define HOPPER_SM_MAJOR 8
|
||||
|
||||
/**
|
||||
* Store information about the version of the CUDA Compute Capabilities detected on the device
|
||||
*/
|
||||
struct CudaComputeCapabilities {
|
||||
int32_t major;
|
||||
int32_t minor;
|
||||
|
||||
[[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
|
||||
|
||||
[[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
|
||||
};
|
||||
|
||||
CudaComputeCapabilities GetCudaComputeCapabilities() {
|
||||
// Get the compute capabilities of the current hardware
|
||||
nvmlDevice_t device;
|
||||
CudaComputeCapabilities capabilities{0, 0};
|
||||
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
|
||||
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
|
||||
if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
|
||||
SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
|
||||
}
|
||||
}
|
||||
|
||||
return capabilities;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of GPU detected. If no GPU is detected, return size_t::max()
|
||||
* @return
|
||||
*/
|
||||
std::optional<size_t> GetNumDevices() {
|
||||
uint32_t numGpus = 0;
|
||||
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
|
||||
return std::optional(numGpus);
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
|
146
backends/trtllm/lib/backend.cpp
Normal file
146
backends/trtllm/lib/backend.cpp
Normal file
@ -0,0 +1,146 @@
|
||||
#include <fstream>
|
||||
|
||||
#include <fmt/ranges.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <nvml.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "hardware.h"
|
||||
|
||||
void huggingface::tgi::backends::InitializeBackend() {
|
||||
SPDLOG_INFO("Initializing Backend...");
|
||||
nvmlInit_v2();
|
||||
initTrtLlmPlugins();
|
||||
|
||||
const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
|
||||
if (numGpus.has_value()) {
|
||||
SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
|
||||
} else {
|
||||
SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]]
|
||||
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
|
||||
tle::ExecutorConfig execConfig(1);
|
||||
|
||||
// Retrieve the compute capabilities to enable some options at runtime
|
||||
const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
|
||||
|
||||
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
|
||||
if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
|
||||
SPDLOG_INFO("Detected single engine deployment, using leader mode");
|
||||
execConfig.setParallelConfig(tle::ParallelConfig(
|
||||
tle::CommunicationType::kMPI,
|
||||
tle::CommunicationMode::kLEADER,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt
|
||||
));
|
||||
} else { // Multiple engines -> using orchestrator mode (MPI involved)
|
||||
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
|
||||
execConfig.setParallelConfig(tle::ParallelConfig(
|
||||
tle::CommunicationType::kMPI,
|
||||
tle::CommunicationMode::kORCHESTRATOR,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
tle::OrchestratorConfig(true, workerPath, nullptr, true)
|
||||
));
|
||||
}
|
||||
|
||||
// Define some configuration variables
|
||||
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
|
||||
execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
|
||||
return execConfig;
|
||||
}
|
||||
|
||||
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
|
||||
uint32_t topK,
|
||||
float_t topP,
|
||||
float_t temperature,
|
||||
float_t repetition_penalty,
|
||||
float_t frequency_penalty,
|
||||
uint64_t seed) {
|
||||
return tle::SamplingConfig(
|
||||
1, // TGI only use a single beam
|
||||
topK,
|
||||
topP,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
seed,
|
||||
temperature,
|
||||
temperature,
|
||||
std::nullopt,
|
||||
repetition_penalty,
|
||||
std::nullopt,
|
||||
frequency_penalty
|
||||
);
|
||||
}
|
||||
|
||||
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
|
||||
const std::filesystem::path &enginesFolder,
|
||||
const std::filesystem::path &executorWorker
|
||||
) :
|
||||
config(json::parse(std::ifstream(enginesFolder / "config.json"))),
|
||||
executor(
|
||||
enginesFolder,
|
||||
tensorrt_llm::executor::ModelType::kDECODER_ONLY,
|
||||
GetExecutorConfig(config, executorWorker.string()
|
||||
)) {
|
||||
SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());
|
||||
}
|
||||
|
||||
bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
|
||||
return executor.canEnqueueRequests();
|
||||
}
|
||||
|
||||
[[nodiscard("Returned number of requests needs to be consumed")]]
|
||||
size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
|
||||
return executor.getNumResponsesReady();
|
||||
}
|
||||
|
||||
[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
|
||||
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
|
||||
const std::vector<tle::TokenIdType> &tokens,
|
||||
const int32_t topK,
|
||||
const float_t topP,
|
||||
const float_t temperature,
|
||||
const float_t repetition_penalty,
|
||||
const float_t frequency_penalty,
|
||||
const uint64_t seed
|
||||
) {
|
||||
#ifdef NDEBUG
|
||||
SPDLOG_DEBUG(
|
||||
FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
|
||||
tokens.size(),
|
||||
executor.getLatestIterationStats().back().numActiveRequests
|
||||
);
|
||||
#else
|
||||
SPDLOG_DEBUG(
|
||||
FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
|
||||
fmt::join(tokens, ", "),
|
||||
executor.getLatestIterationStats().front().numActiveRequests
|
||||
);
|
||||
#endif
|
||||
|
||||
const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<size_t>();
|
||||
const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
|
||||
|
||||
const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
|
||||
const auto output = tle::OutputConfig(true, false, false, true, false);
|
||||
return executor.enqueueRequest(
|
||||
tle::Request{tokens, maxNewTokens, true, sampling, output});
|
||||
}
|
||||
|
||||
[[nodiscard("Generated tokens result must be used")]]
|
||||
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
|
||||
SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
|
||||
return executor.awaitResponses(requestId);
|
||||
}
|
||||
|
||||
|
||||
void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() {
|
||||
SPDLOG_INFO("Shutting down executor");
|
||||
executor.shutdown();
|
||||
}
|
111
backends/trtllm/scripts/install_tensorrt.sh
Executable file
111
backends/trtllm/scripts/install_tensorrt.sh
Executable file
@ -0,0 +1,111 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
TRT_VER="10.2.0.19"
|
||||
CUDA_VER="12.5"
|
||||
CUDNN_VER="9.2.1.18-1"
|
||||
NCCL_VER="2.22.3-1+cuda12.5"
|
||||
CUBLAS_VER="12.5.3.2-1"
|
||||
NVRTC_VER="12.5.82-1"
|
||||
|
||||
for i in "$@"; do
|
||||
case $i in
|
||||
--TRT_VER=?*) TRT_VER="${i#*=}";;
|
||||
--CUDA_VER=?*) CUDA_VER="${i#*=}";;
|
||||
--CUDNN_VER=?*) CUDNN_VER="${i#*=}";;
|
||||
--NCCL_VER=?*) NCCL_VER="${i#*=}";;
|
||||
--CUBLAS_VER=?*) CUBLAS_VER="${i#*=}";;
|
||||
*) ;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
NVCC_VERSION_OUTPUT=$(nvcc --version)
|
||||
if [[ $(echo $NVCC_VERSION_OUTPUT | grep -oP "\d+\.\d+" | head -n 1) != ${CUDA_VER} ]]; then
|
||||
echo "The version of pre-installed CUDA is not equal to ${CUDA_VER}."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
install_ubuntu_requirements() {
|
||||
apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
|
||||
if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
|
||||
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb
|
||||
dpkg -i cuda-keyring_1.0-1_all.deb
|
||||
|
||||
apt-get update
|
||||
if [[ $(apt list --installed | grep libcudnn9) ]]; then
|
||||
apt-get remove --purge -y --allow-change-held-packages libcudnn9*
|
||||
fi
|
||||
if [[ $(apt list --installed | grep libnccl) ]]; then
|
||||
apt-get remove --purge -y --allow-change-held-packages libnccl*
|
||||
fi
|
||||
if [[ $(apt list --installed | grep libcublas) ]]; then
|
||||
apt-get remove --purge -y --allow-change-held-packages libcublas*
|
||||
fi
|
||||
if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
|
||||
apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
|
||||
fi
|
||||
CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
|
||||
apt-get install -y --no-install-recommends libcudnn9-cuda-12=${CUDNN_VER} libcudnn9-dev-cuda-12=${CUDNN_VER}
|
||||
apt-get install -y --no-install-recommends libnccl2=${NCCL_VER} libnccl-dev=${NCCL_VER}
|
||||
apt-get install -y --no-install-recommends libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER}
|
||||
# NVRTC static library doesn't exist in NGC PyTorch container.
|
||||
NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
|
||||
apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
|
||||
apt-get clean
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
}
|
||||
|
||||
install_centos_requirements() {
|
||||
CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
|
||||
yum -y update
|
||||
yum -y install epel-release
|
||||
yum remove -y libnccl* && yum -y install libnccl-${NCCL_VER} libnccl-devel-${NCCL_VER}
|
||||
yum remove -y libcublas* && yum -y install libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER} libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}
|
||||
yum clean all
|
||||
}
|
||||
|
||||
install_tensorrt() {
|
||||
#PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
|
||||
#PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
|
||||
TRT_CUDA_VERSION="12.5"
|
||||
|
||||
if [ -z "$RELEASE_URL_TRT" ];then
|
||||
ARCH=${TRT_TARGETARCH}
|
||||
if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
|
||||
if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
|
||||
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
|
||||
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
|
||||
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
|
||||
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
|
||||
fi
|
||||
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
|
||||
tar -xf /tmp/TensorRT.tar -C /usr/local/
|
||||
mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt
|
||||
# pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
|
||||
rm -rf /tmp/TensorRT.tar
|
||||
}
|
||||
|
||||
# Install base packages depending on the base OS
|
||||
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
||||
case "$ID" in
|
||||
debian)
|
||||
install_ubuntu_requirements
|
||||
install_tensorrt
|
||||
;;
|
||||
ubuntu)
|
||||
install_ubuntu_requirements
|
||||
install_tensorrt
|
||||
;;
|
||||
centos)
|
||||
install_centos_requirements
|
||||
install_tensorrt
|
||||
;;
|
||||
*)
|
||||
echo "Unable to determine OS..."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
329
backends/trtllm/src/backend.rs
Normal file
329
backends/trtllm/src/backend.rs
Normal file
@ -0,0 +1,329 @@
|
||||
use std::future::Future;
|
||||
use std::path::Path;
|
||||
use std::pin::{pin, Pin};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use cxx::UniquePtr;
|
||||
use log::{error, warn};
|
||||
use tokenizers::Tokenizer;
|
||||
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::time::{sleep, Instant};
|
||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||
use tokio_stream::{Stream, StreamExt};
|
||||
use tracing::{instrument, span, Level};
|
||||
|
||||
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
|
||||
use text_generation_router::validation::ValidationError::UnsupportedModality;
|
||||
use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError};
|
||||
use text_generation_router::{FinishReason, Token};
|
||||
|
||||
use crate::errors::TensorRtLlmBackendError;
|
||||
use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
|
||||
|
||||
// Value used to poll the state of the generation stream
|
||||
static POLLING_INTERVAL_US: OnceLock<u64> = OnceLock::new();
|
||||
|
||||
type InferResult<T> = Result<T, InferError>;
|
||||
|
||||
pub(crate) struct Generation {
|
||||
executor: Arc<RwLock<UniquePtr<TensorRtLlmBackendImpl>>>,
|
||||
done: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
/// Holds the user provided input to be executed along with a channel allowing
|
||||
/// to bubble up all the generated tokens for that tokens the to end stream.
|
||||
pub struct GenerationContext {
|
||||
sender: UnboundedSender<InferResult<InferStreamResponse>>,
|
||||
tokenizer: Arc<Tokenizer>,
|
||||
tokens: Vec<u32>,
|
||||
done: Arc<AtomicBool>,
|
||||
queued: Instant,
|
||||
start: Option<Instant>,
|
||||
}
|
||||
|
||||
impl Stream for Generation {
|
||||
type Item = usize;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
let interval = POLLING_INTERVAL_US.get_or_init(|| {
|
||||
u64::from_str(option_env!("TRTLLM_BACKEND_POLLING_INTERVAL_US").unwrap_or("100"))
|
||||
.expect("Invalid value provided for envvar POLLING_INTERVAL_US")
|
||||
});
|
||||
|
||||
if !self.done.load(Ordering::Relaxed) {
|
||||
let backend = pin!(self.executor.read());
|
||||
let status = match backend.poll(ctx) {
|
||||
Poll::Ready(executor_r) => {
|
||||
let ready = executor_r.num_responses_ready();
|
||||
if ready == 0 {
|
||||
Poll::Pending
|
||||
} else {
|
||||
Poll::Ready(Some(ready))
|
||||
}
|
||||
}
|
||||
Poll::Pending => Poll::Pending,
|
||||
};
|
||||
|
||||
let waker = ctx.waker().clone();
|
||||
tokio::spawn(async {
|
||||
sleep(Duration::from_micros(*interval)).await;
|
||||
waker.wake();
|
||||
});
|
||||
|
||||
status
|
||||
} else {
|
||||
Poll::Ready(None) // end of stream
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
(1, None)
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl Send for TensorRtLlmBackendImpl {}
|
||||
unsafe impl Sync for TensorRtLlmBackendImpl {}
|
||||
|
||||
/// Implements the logic to execute generation with TensorRT-LLM executor API in background
|
||||
pub struct TensorRtLlmBackend {
|
||||
tokenizer: Arc<Tokenizer>,
|
||||
|
||||
// Backing the backend behind a RwLock to allow concurrent read access to retrieve
|
||||
// the number of available tokens (read only) in the Generation stream
|
||||
backend: Arc<RwLock<UniquePtr<TensorRtLlmBackendImpl>>>,
|
||||
}
|
||||
|
||||
impl TensorRtLlmBackend {
|
||||
pub fn new<P: AsRef<Path> + Send + 'static, PP: AsRef<Path> + Send + 'static>(
|
||||
tokenizer: Tokenizer,
|
||||
engine_folder: P,
|
||||
executor_worker_path: PP,
|
||||
) -> Result<Self, TensorRtLlmBackendError> {
|
||||
Ok(TensorRtLlmBackend {
|
||||
tokenizer: Arc::new(tokenizer),
|
||||
backend: Arc::new(RwLock::new(create_tensorrt_llm_backend(
|
||||
engine_folder.as_ref().to_str().unwrap(),
|
||||
executor_worker_path.as_ref().to_str().unwrap(),
|
||||
))),
|
||||
})
|
||||
}
|
||||
|
||||
fn validate(request: &ValidGenerateRequest) -> InferResult<&String> {
|
||||
if request.top_n_tokens > 1 {
|
||||
return Err(InferError::ValidationError(
|
||||
ValidationError::TopNTokensDisabled,
|
||||
));
|
||||
}
|
||||
|
||||
// TODO: Is it really needed? How can it be validated before?
|
||||
if request.parameters.grammar.is_some() {
|
||||
return Err(InferError::ValidationError(ValidationError::Grammar));
|
||||
}
|
||||
|
||||
match request.inputs.len() {
|
||||
0 => Err(InferError::ValidationError(ValidationError::EmptyInput)),
|
||||
2.. => Err(InferError::GenerationError(
|
||||
"TensorRT-LLM backend don't support multi-chunk".into(),
|
||||
)),
|
||||
1 => match request.inputs.first().expect("Single item-chunk") {
|
||||
Chunk::Text(text) => Ok(text),
|
||||
Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn generate(
|
||||
&self,
|
||||
sender: UnboundedSender<InferResult<InferStreamResponse>>,
|
||||
tokens: Vec<u32>,
|
||||
top_k: u32,
|
||||
top_p: f32,
|
||||
temperature: f32,
|
||||
repetition_penalty: f32,
|
||||
frequency_penalty: f32,
|
||||
seed: u64,
|
||||
) {
|
||||
let tokenizer = Arc::clone(&self.tokenizer);
|
||||
let executor = Arc::clone(&self.backend);
|
||||
|
||||
// Let's push this in async context
|
||||
tokio::spawn(async move {
|
||||
// Define the generation state
|
||||
let mut generation = Generation {
|
||||
executor: executor.clone(),
|
||||
done: Arc::new(AtomicBool::new(false)),
|
||||
};
|
||||
|
||||
// Define the context over the generation
|
||||
// TODO(asap): Do we really need so many shared-ownership?
|
||||
let ctx = Box::new(GenerationContext {
|
||||
sender: sender.clone(),
|
||||
tokenizer,
|
||||
tokens: vec![],
|
||||
done: Arc::clone(&generation.done),
|
||||
start: None,
|
||||
queued: Instant::now(),
|
||||
});
|
||||
|
||||
// We are leaking the context on-purpose to avoid the box being dropped while there are
|
||||
// still computation ongoing
|
||||
// TODO(asap): Can we achieve the same with an Arc<Box<T>> without the need to go unsafe?
|
||||
let ctx_ = Box::leak(ctx);
|
||||
|
||||
// Submit the request to the batcher
|
||||
let request_id = span!(Level::DEBUG, "submit")
|
||||
.in_scope(|| async {
|
||||
let mut handle = executor.write().await;
|
||||
let request_id = handle.pin_mut().submit(
|
||||
&tokens,
|
||||
top_k as i32,
|
||||
top_p,
|
||||
temperature,
|
||||
repetition_penalty,
|
||||
frequency_penalty,
|
||||
seed,
|
||||
);
|
||||
|
||||
request_id
|
||||
})
|
||||
.await;
|
||||
|
||||
while let Some(_) = generation.next().await {
|
||||
let mut executor_w = executor.write().await;
|
||||
let executor = executor_w.pin_mut();
|
||||
|
||||
span!(Level::DEBUG, "decode")
|
||||
.in_scope(|| async {
|
||||
unsafe {
|
||||
executor.stream_tokens(
|
||||
request_id,
|
||||
ctx_,
|
||||
|ctx: *mut GenerationContext, step: GenerationStep| {
|
||||
let inner_ctx = &mut *ctx;
|
||||
|
||||
// Update the timestamp at which the request started effectively
|
||||
// Can be a bit off, would need to be before the callback, let's see
|
||||
inner_ctx.start.get_or_insert(Instant::now());
|
||||
inner_ctx.done.store(step.is_final, Ordering::Relaxed);
|
||||
|
||||
// Ensure we are not running into errors
|
||||
let parcel = if !step.has_error {
|
||||
// Insert the latest generated token to the tracker
|
||||
inner_ctx.tokens.push(step.token_id);
|
||||
|
||||
// Decode the token
|
||||
let text = inner_ctx
|
||||
.tokenizer
|
||||
.decode(&[step.token_id], true)
|
||||
.expect("Failed to decode token");
|
||||
|
||||
let special = inner_ctx
|
||||
.tokenizer
|
||||
.get_added_vocabulary()
|
||||
.is_special_token(&text);
|
||||
|
||||
// Create the structure holding the token
|
||||
let token = Token {
|
||||
id: step.token_id,
|
||||
text,
|
||||
logprob: step.log_prob,
|
||||
special,
|
||||
};
|
||||
|
||||
if step.is_final {
|
||||
let generated_text = inner_ctx
|
||||
.tokenizer
|
||||
.decode(&inner_ctx.tokens, true)
|
||||
.expect("Failed to decode generated_tokens");
|
||||
|
||||
Ok(InferStreamResponse::End {
|
||||
token,
|
||||
top_tokens: vec![],
|
||||
generated_text: GeneratedText {
|
||||
text: generated_text,
|
||||
generated_tokens: inner_ctx.tokens.len() as u32,
|
||||
finish_reason: FinishReason::EndOfSequenceToken,
|
||||
seed: None,
|
||||
},
|
||||
start: inner_ctx.start.unwrap_or(Instant::now()),
|
||||
queued: inner_ctx.queued,
|
||||
})
|
||||
} else {
|
||||
Ok(InferStreamResponse::Intermediate {
|
||||
token,
|
||||
top_tokens: vec![],
|
||||
})
|
||||
}
|
||||
} else {
|
||||
error!("Error caught while decoding: {}", &step.error_msg);
|
||||
Err(InferError::GenerationError(step.error_msg))
|
||||
};
|
||||
|
||||
// Send the parcel to the client
|
||||
inner_ctx
|
||||
.sender
|
||||
.send(parcel)
|
||||
.expect("Failed to sent msg through the channel");
|
||||
},
|
||||
);
|
||||
}
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
// "Properly" free the shared context...
|
||||
// TODO: clean that piece of sh** asap
|
||||
unsafe {
|
||||
let _ = Box::from_raw(ctx_);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Backend for TensorRtLlmBackend {
|
||||
#[instrument(skip_all)]
|
||||
fn schedule(
|
||||
&self,
|
||||
request: ValidGenerateRequest,
|
||||
) -> InferResult<UnboundedReceiverStream<InferResult<InferStreamResponse>>> {
|
||||
// Let's add a few more validation
|
||||
let input = TensorRtLlmBackend::validate(&request)?;
|
||||
|
||||
// Channel to stream the generated token as they come from the worker thread back to the transport layer
|
||||
let (sender, receiver) = unbounded_channel();
|
||||
|
||||
// Unpack parameters
|
||||
let params = &request.parameters;
|
||||
|
||||
// Preprocess the inputs to send to TRTLLM backend
|
||||
let encoding = self
|
||||
.tokenizer
|
||||
.encode(input.as_str(), true)
|
||||
.map_err(|e| InferError::GenerationError(e.to_string()))?;
|
||||
|
||||
// Generate the response
|
||||
self.generate(
|
||||
sender,
|
||||
Vec::from(encoding.get_ids()),
|
||||
params.top_k,
|
||||
params.top_p,
|
||||
params.temperature,
|
||||
params.repetition_penalty,
|
||||
params.frequency_penalty,
|
||||
params.seed,
|
||||
);
|
||||
|
||||
Ok(UnboundedReceiverStream::new(receiver))
|
||||
}
|
||||
|
||||
async fn health(&self, _current_health: bool) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
15
backends/trtllm/src/errors.rs
Normal file
15
backends/trtllm/src/errors.rs
Normal file
@ -0,0 +1,15 @@
|
||||
use thiserror::Error;
|
||||
|
||||
use text_generation_router::server;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum TensorRtLlmBackendError {
|
||||
#[error("Tokenizer error: {0}")]
|
||||
Tokenizer(String),
|
||||
#[error("Argument validation error: {0}")]
|
||||
ArgumentValidation(String),
|
||||
#[error("WebServer error: {0}")]
|
||||
WebServer(#[from] server::WebServerError),
|
||||
#[error("Tokio runtime failed to start: {0}")]
|
||||
Tokio(#[from] std::io::Error),
|
||||
}
|
84
backends/trtllm/src/ffi.cpp
Normal file
84
backends/trtllm/src/ffi.cpp
Normal file
@ -0,0 +1,84 @@
|
||||
//
|
||||
// Created by mfuntowicz on 6/30/24.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <exception>
|
||||
#include <filesystem>
|
||||
#include <limits>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
#include "backends/trtllm/include/ffi.h"
|
||||
|
||||
|
||||
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
|
||||
const std::string_view &engineFolder,
|
||||
const std::string_view &executorWorker
|
||||
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
|
||||
|
||||
|
||||
bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const {
|
||||
return TensorRtLlmBackend::IsReady();
|
||||
}
|
||||
|
||||
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
|
||||
rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty,
|
||||
float_t frequency_penalty, uint64_t seed) {
|
||||
|
||||
// This will copy all the items from the initial slice
|
||||
std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
|
||||
return TensorRtLlmBackend::Submit(
|
||||
std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
|
||||
}
|
||||
|
||||
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
|
||||
const uint64_t requestId,
|
||||
huggingface::tgi::backends::GenerationContext *ctx,
|
||||
rust::Fn<void(huggingface::tgi::backends::GenerationContext *,
|
||||
huggingface::tgi::backends::GenerationStep)> callback) {
|
||||
|
||||
size_t numTokens = 0;
|
||||
for (const auto &item: Poll(requestId)) {
|
||||
GenerationStep step;
|
||||
if (!item.hasError()) {
|
||||
SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
|
||||
const auto decoded = item.getResult();
|
||||
|
||||
const auto token = decoded.outputTokenIds[0][0];
|
||||
const auto isFinal = decoded.isFinal;
|
||||
const auto logProb = decoded.logProbs.value()[0][0];
|
||||
|
||||
++numTokens;
|
||||
|
||||
SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
|
||||
step = huggingface::tgi::backends::GenerationStep{
|
||||
static_cast<uint32_t>(token), logProb, isFinal, false, std::move(std::string())
|
||||
};
|
||||
SPDLOG_DEBUG("\tStreamTokens -> Post callback");
|
||||
} else {
|
||||
// TODO : Return rest::Result with error
|
||||
const auto what = item.getErrorMsg();
|
||||
SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", what);
|
||||
step = huggingface::tgi::backends::GenerationStep{
|
||||
std::numeric_limits<uint32_t>::max(), 0.0, true, true, std::move(what)
|
||||
};
|
||||
}
|
||||
|
||||
callback(std::move(ctx), std::move(step));
|
||||
}
|
||||
|
||||
return numTokens;
|
||||
}
|
||||
|
||||
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
|
||||
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
|
||||
// Unconditionally call this to initialize and discover TRTLLM plugins
|
||||
InitializeBackend();
|
||||
|
||||
const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
|
||||
const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
|
||||
return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
|
||||
}
|
78
backends/trtllm/src/lib.rs
Normal file
78
backends/trtllm/src/lib.rs
Normal file
@ -0,0 +1,78 @@
|
||||
pub use backend::{GenerationContext, TensorRtLlmBackend};
|
||||
|
||||
mod backend;
|
||||
pub mod errors;
|
||||
|
||||
#[cxx::bridge(namespace = "huggingface::tgi::backends")]
|
||||
mod ffi {
|
||||
|
||||
/// Struct used as shared type between rust and C++ to represent the result
|
||||
/// of a single decoding iteration
|
||||
pub struct GenerationStep {
|
||||
token_id: u32,
|
||||
log_prob: f32,
|
||||
is_final: bool,
|
||||
has_error: bool,
|
||||
error_msg: String,
|
||||
}
|
||||
|
||||
extern "Rust" {
|
||||
type GenerationContext;
|
||||
}
|
||||
|
||||
unsafe extern "C++" {
|
||||
include!("backends/trtllm/src/ffi.cpp");
|
||||
|
||||
/// Represent an instance of the underlying TensorRT-LLM backend
|
||||
type TensorRtLlmBackendImpl;
|
||||
|
||||
/// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `engine_folder`: Path to the folder containing all the TRTLLM engines
|
||||
/// * `executor_worker`: Path to the TRTLLM executor worker
|
||||
///
|
||||
/// returns: <unknown>
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
///
|
||||
/// ```
|
||||
#[rust_name = "create_tensorrt_llm_backend"]
|
||||
fn CreateTensorRtLlmBackend(
|
||||
engine_folder: &str,
|
||||
executor_worker: &str,
|
||||
) -> UniquePtr<TensorRtLlmBackendImpl>;
|
||||
|
||||
// #[rust_name = "is_ready"]
|
||||
// fn IsReady(self: &TensorRtLlmBackendImpl) -> bool;
|
||||
|
||||
#[rust_name = "num_responses_ready"]
|
||||
fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
|
||||
|
||||
#[rust_name = "submit"]
|
||||
fn Submit(
|
||||
self: Pin<&mut TensorRtLlmBackendImpl>,
|
||||
tokens: &[u32],
|
||||
top_k: i32,
|
||||
top_p: f32,
|
||||
temperature: f32,
|
||||
repetition_penalty: f32,
|
||||
frequency_penalty: f32,
|
||||
seed: u64,
|
||||
) -> u64;
|
||||
|
||||
#[rust_name = "stream_tokens"]
|
||||
unsafe fn StreamTokens(
|
||||
self: Pin<&mut TensorRtLlmBackendImpl>,
|
||||
request_id: u64,
|
||||
ctx: *mut GenerationContext,
|
||||
cb: unsafe fn(*mut GenerationContext, GenerationStep),
|
||||
) -> usize;
|
||||
|
||||
// #[rust_name = "shutdown"]
|
||||
// fn Shutdown(self: Pin<&mut TensorRtLlmBackendImpl>);
|
||||
}
|
||||
}
|
166
backends/trtllm/src/main.rs
Normal file
166
backends/trtllm/src/main.rs
Normal file
@ -0,0 +1,166 @@
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::Parser;
|
||||
use tokenizers::{FromPretrainedParameters, Tokenizer};
|
||||
|
||||
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
|
||||
use text_generation_backends_trtllm::TensorRtLlmBackend;
|
||||
use text_generation_router::server;
|
||||
|
||||
/// App Configuration
|
||||
#[derive(Parser, Debug)]
|
||||
#[clap(author, version, about, long_about = None)]
|
||||
struct Args {
|
||||
#[clap(default_value = "128", long, env)]
|
||||
max_concurrent_requests: usize,
|
||||
#[clap(default_value = "2", long, env)]
|
||||
max_best_of: usize,
|
||||
#[clap(default_value = "4", long, env)]
|
||||
max_stop_sequences: usize,
|
||||
#[clap(default_value = "5", long, env)]
|
||||
max_top_n_tokens: u32,
|
||||
#[clap(default_value = "1024", long, env)]
|
||||
max_input_tokens: usize,
|
||||
#[clap(default_value = "2048", long, env)]
|
||||
max_total_tokens: usize,
|
||||
#[clap(default_value = "4096", long, env)]
|
||||
max_batch_prefill_tokens: u32,
|
||||
#[clap(long, env)]
|
||||
max_batch_total_tokens: Option<u32>,
|
||||
#[clap(default_value = "0.0.0.0", long, env)]
|
||||
hostname: String,
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
#[clap(long, env, required = true)]
|
||||
tokenizer_name: String,
|
||||
#[clap(long, env)]
|
||||
tokenizer_config_path: Option<String>,
|
||||
#[clap(long, env)]
|
||||
revision: Option<String>,
|
||||
#[clap(long, env)]
|
||||
model_id: String,
|
||||
#[clap(default_value = "2", long, env)]
|
||||
validation_workers: usize,
|
||||
#[clap(long, env)]
|
||||
json_output: bool,
|
||||
#[clap(long, env)]
|
||||
otlp_endpoint: Option<String>,
|
||||
#[clap(default_value = "text-generation-inference.router", long, env)]
|
||||
otlp_service_name: String,
|
||||
#[clap(long, env)]
|
||||
cors_allow_origin: Option<Vec<String>>,
|
||||
#[clap(long, env, default_value_t = false)]
|
||||
messages_api_enabled: bool,
|
||||
#[clap(default_value = "4", long, env)]
|
||||
max_client_batch_size: usize,
|
||||
#[clap(long, env)]
|
||||
auth_token: Option<String>,
|
||||
#[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
|
||||
executor_worker: PathBuf,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), TensorRtLlmBackendError> {
|
||||
// Get args
|
||||
let args = Args::parse();
|
||||
// Pattern match configuration
|
||||
let Args {
|
||||
max_concurrent_requests,
|
||||
max_best_of,
|
||||
max_stop_sequences,
|
||||
max_top_n_tokens,
|
||||
max_input_tokens,
|
||||
max_total_tokens,
|
||||
max_batch_prefill_tokens,
|
||||
max_batch_total_tokens,
|
||||
hostname,
|
||||
port,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
revision,
|
||||
model_id,
|
||||
validation_workers,
|
||||
json_output,
|
||||
otlp_endpoint,
|
||||
otlp_service_name,
|
||||
cors_allow_origin,
|
||||
messages_api_enabled,
|
||||
max_client_batch_size,
|
||||
auth_token,
|
||||
executor_worker,
|
||||
} = args;
|
||||
|
||||
// Launch Tokio runtime
|
||||
text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
|
||||
|
||||
// Validate args
|
||||
if max_input_tokens >= max_total_tokens {
|
||||
return Err(TensorRtLlmBackendError::ArgumentValidation(
|
||||
"`max_input_tokens` must be < `max_total_tokens`".to_string(),
|
||||
));
|
||||
}
|
||||
if max_input_tokens as u32 > max_batch_prefill_tokens {
|
||||
return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
|
||||
}
|
||||
|
||||
if validation_workers == 0 {
|
||||
return Err(TensorRtLlmBackendError::ArgumentValidation(
|
||||
"`validation_workers` must be > 0".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
|
||||
if max_batch_prefill_tokens > *max_batch_total_tokens {
|
||||
return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
|
||||
}
|
||||
if max_total_tokens as u32 > *max_batch_total_tokens {
|
||||
return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
|
||||
}
|
||||
}
|
||||
|
||||
if !executor_worker.exists() {
|
||||
return Err(TensorRtLlmBackendError::ArgumentValidation(format!(
|
||||
"`executor_work` specified path doesn't exists: {}",
|
||||
executor_worker.display()
|
||||
)));
|
||||
}
|
||||
|
||||
// Run server
|
||||
let tokenizer = Tokenizer::from_pretrained(
|
||||
tokenizer_name.clone(),
|
||||
Some(FromPretrainedParameters {
|
||||
revision: revision.clone().unwrap_or(String::from("main")),
|
||||
user_agent: HashMap::new(),
|
||||
auth_token,
|
||||
}),
|
||||
)
|
||||
.map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?;
|
||||
|
||||
let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?;
|
||||
server::run(
|
||||
backend,
|
||||
max_concurrent_requests,
|
||||
max_best_of,
|
||||
max_stop_sequences,
|
||||
max_top_n_tokens,
|
||||
max_input_tokens,
|
||||
max_total_tokens,
|
||||
validation_workers,
|
||||
None,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
revision,
|
||||
hostname,
|
||||
port,
|
||||
cors_allow_origin,
|
||||
false,
|
||||
None,
|
||||
None,
|
||||
messages_api_enabled,
|
||||
true,
|
||||
max_client_batch_size,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
14
backends/trtllm/tests/infer_test.cpp
Normal file
14
backends/trtllm/tests/infer_test.cpp
Normal file
@ -0,0 +1,14 @@
|
||||
//
|
||||
// Created by mfuntowicz on 7/2/24.
|
||||
//
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include "../include/backend.h"
|
||||
|
||||
TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
|
||||
const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
|
||||
const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
|
||||
|
||||
spdlog::info("Loading config from: {}", absolute(engines).string());
|
||||
huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
|
||||
}
|
66
backends/v3/Cargo.toml
Normal file
66
backends/v3/Cargo.toml
Normal file
@ -0,0 +1,66 @@
|
||||
[package]
|
||||
name = "text-generation-router-v3"
|
||||
description = "Text Generation Webserver"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
authors.workspace = true
|
||||
homepage.workspace = true
|
||||
|
||||
[lib]
|
||||
path = "src/lib.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "text-generation-router"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1.74"
|
||||
async-stream = "0.3.5"
|
||||
axum = { version = "0.7", features = ["json"] }
|
||||
axum-tracing-opentelemetry = "0.16"
|
||||
text-generation-router = { path = "../../router" }
|
||||
clap = { version = "4.4.5", features = ["derive", "env"] }
|
||||
grpc-metadata = { path = "../grpc-metadata" }
|
||||
futures = "0.3.28"
|
||||
hf-hub = { workspace = true }
|
||||
jsonschema = { version = "0.17.1", features = ["draft202012"] }
|
||||
metrics = { workspace = true }
|
||||
metrics-exporter-prometheus = { workspace = true }
|
||||
nohash-hasher = "0.2.0"
|
||||
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
|
||||
opentelemetry-otlp = "0.13.0"
|
||||
rand = "0.8.5"
|
||||
reqwest = { version = "0.11.20", features = [] }
|
||||
serde = "1.0.188"
|
||||
serde_json = "1.0.107"
|
||||
thiserror = "1.0.48"
|
||||
tokenizers = { workspace = true}
|
||||
tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
|
||||
tokio-stream = "0.1.14"
|
||||
tower-http = { version = "0.5.1", features = ["cors"] }
|
||||
tracing = "0.1.37"
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
|
||||
utoipa = { version = "4.2.0", features = ["axum_extras"] }
|
||||
utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
|
||||
init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
|
||||
minijinja = { version = "2.0.2" }
|
||||
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
|
||||
futures-util = "0.3.30"
|
||||
regex = "1.10.3"
|
||||
once_cell = "1.19.0"
|
||||
image = "0.25.1"
|
||||
base64 = { workspace = true }
|
||||
prost = "^0.12"
|
||||
tonic = "^0.10"
|
||||
tower = "^0.4"
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build = "0.10.1"
|
||||
prost-build = "0.12.1"
|
||||
|
||||
[features]
|
||||
default = ["ngrok"]
|
||||
ngrok = ["text-generation-router/ngrok"]
|
||||
google = ["text-generation-router/google"]
|
||||
kserve = ["text-generation-router/kserve"]
|
19
backends/v3/build.rs
Normal file
19
backends/v3/build.rs
Normal file
@ -0,0 +1,19 @@
|
||||
use std::fs;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
println!("cargo:rerun-if-changed=../../proto/");
|
||||
|
||||
fs::create_dir_all("src/client/pb").unwrap_or(());
|
||||
let mut config = prost_build::Config::new();
|
||||
config.protoc_arg("--experimental_allow_proto3_optional");
|
||||
|
||||
tonic_build::configure()
|
||||
.build_client(true)
|
||||
.build_server(false)
|
||||
.out_dir("src/client/pb")
|
||||
.include_file("mod.rs")
|
||||
.compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
|
||||
.unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
|
||||
|
||||
Ok(())
|
||||
}
|
508
backends/v3/src/backend.rs
Normal file
508
backends/v3/src/backend.rs
Normal file
@ -0,0 +1,508 @@
|
||||
use crate::client::{Batch, CachedBatch, ClientError, Generation, Health, ShardedClient};
|
||||
/// Batching and inference logic
|
||||
use crate::queue::{Entry, Queue};
|
||||
use async_trait::async_trait;
|
||||
use nohash_hasher::IntMap;
|
||||
use std::sync::Arc;
|
||||
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
|
||||
use text_generation_router::validation::ValidGenerateRequest;
|
||||
use text_generation_router::{FinishReason, PrefillToken, Token};
|
||||
use tokio::sync::mpsc::error::SendError;
|
||||
use tokio::sync::{mpsc, Notify};
|
||||
use tokio::time::Instant;
|
||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||
use tracing::{info_span, instrument, Instrument, Span};
|
||||
|
||||
pub struct BackendV3 {
|
||||
/// Request queue
|
||||
queue: Queue,
|
||||
/// Notify batcher on queue appends
|
||||
batching_task_notifier: Arc<Notify>,
|
||||
/// Client clone, used for health checks to skip the queue
|
||||
client: ShardedClient,
|
||||
}
|
||||
|
||||
impl BackendV3 {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn new(
|
||||
client: ShardedClient,
|
||||
waiting_served_ratio: f32,
|
||||
max_batch_prefill_tokens: u32,
|
||||
max_batch_total_tokens: u32,
|
||||
max_waiting_tokens: usize,
|
||||
max_batch_size: Option<usize>,
|
||||
requires_padding: bool,
|
||||
window_size: Option<u32>,
|
||||
speculate: u32,
|
||||
) -> Self {
|
||||
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
|
||||
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
|
||||
} else {
|
||||
false
|
||||
};
|
||||
let block_size = if flashdecoding { 256 } else { 16 };
|
||||
|
||||
let queue = Queue::new(
|
||||
requires_padding,
|
||||
block_size,
|
||||
window_size,
|
||||
speculate,
|
||||
max_batch_total_tokens,
|
||||
);
|
||||
let batching_task_notifier = Arc::new(Notify::new());
|
||||
|
||||
// Spawn batching background task that contains all the inference logic
|
||||
tokio::spawn(batching_task(
|
||||
client.clone(),
|
||||
waiting_served_ratio,
|
||||
max_batch_prefill_tokens,
|
||||
max_batch_total_tokens,
|
||||
max_waiting_tokens,
|
||||
max_batch_size,
|
||||
queue.clone(),
|
||||
batching_task_notifier.clone(),
|
||||
));
|
||||
|
||||
Self {
|
||||
queue,
|
||||
batching_task_notifier,
|
||||
client,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Backend for BackendV3 {
|
||||
#[instrument(skip_all)]
|
||||
fn schedule(
|
||||
&self,
|
||||
request: ValidGenerateRequest,
|
||||
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
|
||||
// MPSC channel to communicate with the background batching task
|
||||
let (response_tx, response_rx) = mpsc::unbounded_channel();
|
||||
|
||||
// Append the request to the queue
|
||||
self.queue.append(Entry {
|
||||
request,
|
||||
response_tx,
|
||||
span: Span::current(),
|
||||
temp_span: None,
|
||||
queue_time: Instant::now(),
|
||||
batch_time: None,
|
||||
block_allocation: None,
|
||||
});
|
||||
|
||||
// Notify the background task that we have a new entry in the queue that needs
|
||||
// to be batched
|
||||
self.batching_task_notifier.notify_one();
|
||||
|
||||
// Return stream
|
||||
Ok(UnboundedReceiverStream::new(response_rx))
|
||||
}
|
||||
|
||||
async fn health(&self, current_health: bool) -> bool {
|
||||
if current_health {
|
||||
// Generation is healthy, we only check that the shards can allocate on device
|
||||
self.client.device_health().await
|
||||
} else {
|
||||
self.client.model_health().await
|
||||
}
|
||||
.is_ok()
|
||||
}
|
||||
}
|
||||
|
||||
/// Batching logic
|
||||
/// Will be launched in a background Tokio task
|
||||
///
|
||||
/// Batches requests and sends them to the inference server
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn batching_task(
|
||||
mut client: ShardedClient,
|
||||
waiting_served_ratio: f32,
|
||||
max_batch_prefill_tokens: u32,
|
||||
max_batch_total_tokens: u32,
|
||||
max_waiting_tokens: usize,
|
||||
max_batch_size: Option<usize>,
|
||||
queue: Queue,
|
||||
notifier: Arc<Notify>,
|
||||
) {
|
||||
// Infinite loop
|
||||
loop {
|
||||
// Wait for a notification from the Infer struct
|
||||
notifier.notified().await;
|
||||
|
||||
// Get the next batch from the queue
|
||||
// This batch might be smaller than the maximum batch size if there are not enough requests
|
||||
// waiting in the queue
|
||||
while let Some((mut entries, batch, span)) = queue
|
||||
.next_batch(
|
||||
None,
|
||||
max_batch_size,
|
||||
max_batch_prefill_tokens,
|
||||
max_batch_total_tokens,
|
||||
)
|
||||
.await
|
||||
{
|
||||
let mut cached_batch = prefill(&mut client, batch, &mut entries)
|
||||
.instrument(span)
|
||||
.await;
|
||||
let mut waiting_tokens = 1;
|
||||
|
||||
// We loop until we do not receive any cached batch from the inference server (== until
|
||||
// all requests have met their stopping criteria)
|
||||
while let Some(batch) = cached_batch {
|
||||
// Get current batch info
|
||||
let batch_size = batch.size;
|
||||
let batch_max_tokens = batch.max_tokens;
|
||||
let mut batches = vec![batch];
|
||||
metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
|
||||
metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
|
||||
|
||||
let min_size = if waiting_tokens >= max_waiting_tokens {
|
||||
// If we didn't onboard any new requests since >= max_waiting_tokens, we try
|
||||
// to add a new batch even though its size might be small
|
||||
None
|
||||
} else {
|
||||
// Minimum batch size
|
||||
Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
|
||||
};
|
||||
|
||||
let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
|
||||
let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize);
|
||||
|
||||
// Try to get a new batch
|
||||
if let Some((mut new_entries, new_batch, span)) = queue
|
||||
.next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
|
||||
.await
|
||||
{
|
||||
// Tracking metrics
|
||||
if min_size.is_some() {
|
||||
metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
|
||||
.increment(1);
|
||||
} else {
|
||||
metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
|
||||
.increment(1);
|
||||
}
|
||||
|
||||
entries.iter_mut().for_each(|(_, entry)| {
|
||||
// Create a new span to add the info that this entry is waiting
|
||||
// because a new batch is being computed
|
||||
let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
|
||||
// Add relationships
|
||||
span.follows_from(&entry_waiting_span);
|
||||
entry_waiting_span.follows_from(&span);
|
||||
// Update entry
|
||||
entry.temp_span = Some(entry_waiting_span);
|
||||
});
|
||||
|
||||
// Generate one token for this new batch to have the attention past in cache
|
||||
let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
|
||||
.instrument(span)
|
||||
.await;
|
||||
// Reset waiting counter
|
||||
waiting_tokens = 1;
|
||||
// Extend current batch with the new batch
|
||||
if let Some(new_cached_batch) = new_cached_batch {
|
||||
entries.extend(new_entries);
|
||||
batches.push(new_cached_batch);
|
||||
}
|
||||
}
|
||||
|
||||
// Create span for this batch to add context to inference calls
|
||||
let next_batch_size = entries.len();
|
||||
let next_batch_span =
|
||||
info_span!(parent: None, "batch", batch_size = next_batch_size);
|
||||
entries.iter_mut().for_each(|(_, entry)| {
|
||||
// Create a new span to link the batch back to this entry
|
||||
let entry_batch_span = info_span!(parent: &entry.span, "infer");
|
||||
// Add relationships
|
||||
next_batch_span.follows_from(&entry_batch_span);
|
||||
entry_batch_span.follows_from(&next_batch_span);
|
||||
// Update entry
|
||||
entry.temp_span = Some(entry_batch_span);
|
||||
});
|
||||
|
||||
cached_batch = decode(&mut client, batches, &mut entries)
|
||||
.instrument(next_batch_span)
|
||||
.await;
|
||||
waiting_tokens += 1;
|
||||
}
|
||||
metrics::gauge!("tgi_batch_current_size").set(0.0);
|
||||
metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Run one prefill (first forward pass) for `batch` on all shards.
///
/// On success: streams generated tokens to each entry, filters out stopped
/// requests, records per-phase latency histograms and returns the next
/// cached batch (if any request is still running).
/// On error: clears the shard cache for this batch id, reports the error to
/// every entry and returns `None`, discarding the whole batch.
#[instrument(skip_all)]
async fn prefill(
    client: &mut ShardedClient,
    batch: Batch,
    entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let start_time = Instant::now();
    // Saved before `batch` is moved into the client call so the error path
    // can still clear the cache for this id.
    let batch_id = batch.id;
    metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);

    match client.prefill(batch).await {
        Ok((generations, next_batch, timings)) => {
            let start_filtering_time = Instant::now();
            // Send generated tokens and filter stopped entries
            filter_send_generations(generations, entries);

            // Filter next batch and remove requests that were stopped
            let next_batch = filter_batch(client, next_batch, entries).await;

            metrics::histogram!("tgi_batch_forward_duration", "method" => "prefill")
                .record(timings.forward.as_secs_f64());
            metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
                .record(timings.decode.as_secs_f64());
            metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
                .record(start_filtering_time.elapsed().as_secs_f64());
            metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
                .record(start_time.elapsed().as_secs_f64());
            metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
            next_batch
        }
        // If we have an error, we discard the whole batch
        Err(err) => {
            // Best-effort cache clear: the result is intentionally ignored.
            let _ = client.clear_cache(Some(batch_id)).await;
            send_errors(err, entries);
            metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
            None
        }
    }
}
|
||||
|
||||
/// Run one decode (incremental token) step over `batches` on all shards.
///
/// Mirrors `prefill`: on success it streams tokens to the entries, filters
/// stopped requests, records latency histograms (including the optional
/// batch-concatenation time) and returns the next cached batch. On error it
/// clears the shard cache for every input batch id, reports the error to all
/// entries and returns `None`.
#[instrument(skip_all)]
async fn decode(
    client: &mut ShardedClient,
    batches: Vec<CachedBatch>,
    entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let start_time = Instant::now();
    // Collected up front because `batches` is moved into the client call;
    // the error path needs the ids to clear each shard cache.
    let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
    metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);

    match client.decode(batches).await {
        Ok((generations, next_batch, timings)) => {
            let start_filtering_time = Instant::now();
            // Send generated tokens and filter stopped entries
            filter_send_generations(generations, entries);

            // Filter next batch and remove requests that were stopped
            let next_batch = filter_batch(client, next_batch, entries).await;

            // `concat` is only present when the server merged batches this step.
            if let Some(concat_duration) = timings.concat {
                metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
                    .record(concat_duration.as_secs_f64());
            }
            metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
                .record(timings.forward.as_secs_f64());
            metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
                .record(timings.decode.as_secs_f64());
            metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
                .record(start_filtering_time.elapsed().as_secs_f64());
            metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
                .record(start_time.elapsed().as_secs_f64());
            metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
            next_batch
        }
        // If we have an error, we discard the whole batch
        Err(err) => {
            // Best-effort cache clears: results are intentionally ignored.
            for id in batch_ids {
                let _ = client.clear_cache(Some(id)).await;
            }
            send_errors(err, entries);
            metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
            None
        }
    }
}
|
||||
|
||||
/// Filter a `batch` and remove all requests not present in `entries`.
///
/// Returns `None` when there is no next batch or when every request of the
/// batch has stopped; otherwise returns the (possibly filtered) batch.
/// Panics (by design) if the shard cache cannot be updated, since the router
/// cannot recover from an inconsistent cache.
#[instrument(skip_all)]
async fn filter_batch(
    client: &mut ShardedClient,
    next_batch: Option<CachedBatch>,
    entries: &IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let mut batch = next_batch?;

    // No need to filter: every request of the batch is still live.
    if batch.size as usize == entries.len() {
        return Some(batch);
    }

    let id = batch.id;

    // Retain only requests that are still in entries
    // (the closure's `id` parameter shadows the batch id above on purpose).
    batch.request_ids.retain(|id| entries.contains_key(id));

    if batch.request_ids.is_empty() {
        // All requests have been filtered out
        // Next batch is now empty
        // Clear it from the Python shards cache
        // We unwrap here as we need to panic since we cannot recover if this method fails
        client.clear_cache(Some(id)).await.unwrap();
        None
    } else {
        // Filter Python shard cache
        // We unwrap here as we need to panic since we cannot recover if this method fails
        client.filter_batch(id, batch.request_ids).await.unwrap()
    }
}
|
||||
|
||||
/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
/// and filter entries.
///
/// Each generation is forwarded to its entry's response channel; entries
/// whose generation finished — or whose client went away — are removed
/// from `entries` so they drop out of the next batch.
#[instrument(skip_all)]
fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
    generations.into_iter().for_each(|generation| {
        let id = generation.request_id;
        // Get entry
        // We can `expect` here as the request id should always be in the entries
        let entry = entries
            .get(&id)
            .expect("ID not found in entries. This is a bug.");

        // Create and enter a span to link this function back to the entry
        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
        // Send generation responses back to the infer task
        // If we receive an error from the channel, it means that the client dropped the
        // request and we need to stop generating, hence the unwrap_or(true).
        let stopped = send_responses(generation, entry).map_err(|err| {
            tracing::error!("Entry response channel error.");
            metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
            err
        }).unwrap_or(true);
        if stopped {
            entries.remove(&id).expect("ID not found in entries. This is a bug.");
        }
    });
}
|
||||
|
||||
/// Send responses through the `entry` response channel.
///
/// Forwards the prefill tokens (if present) and every generated token of
/// this step to the entry, emitting `InferStreamResponse::End` for the very
/// last token when the generation finished. Returns `Ok(true)` when the
/// request should stop (finished or client disconnected), `Ok(false)` when
/// generation continues, and `Err` when the channel send itself failed.
fn send_responses(
    generation: Generation,
    entry: &Entry,
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
    // Return directly if the channel is disconnected
    if entry.response_tx.is_closed() {
        metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
        return Ok(true);
    }

    let mut stopped = false;

    if let Some(prefill_tokens) = generation.prefill_tokens {
        // Create Token objects
        // We do that here instead of in the Python code as Rust for loops are faster
        let prefill_tokens = prefill_tokens
            .ids
            .into_iter()
            .zip(prefill_tokens.logprobs)
            .zip(prefill_tokens.texts)
            .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
            .collect();

        // Send message
        entry
            .response_tx
            .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
    }

    // Create last Token
    let tokens_ = generation.tokens.expect("Non empty tokens in generation");
    let n = tokens_.ids.len();
    // With speculative decoding a step can yield several tokens; n - 1 of
    // them were "skipped" (not produced by a dedicated forward pass).
    metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
    // Peekable so the loop can tell the final token of the step apart: only
    // that one may carry the `End` response below.
    let mut iterator = tokens_
        .ids
        .into_iter()
        .zip(tokens_.logprobs)
        .zip(tokens_.texts)
        .zip(tokens_.is_special)
        .enumerate()
        .peekable();
    while let Some((i, (((id, logprob), text), special))) = iterator.next() {
        let token = Token {
            id,
            text,
            logprob,
            special,
        };
        // Top-N alternative tokens for position `i`, when the client asked for them.
        let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
            top_tokens_
                .ids
                .iter()
                .zip(top_tokens_.logprobs.iter())
                .zip(top_tokens_.texts.iter())
                .zip(top_tokens_.is_special.iter())
                .map(|(((&id, &logprob), text), &special)| Token {
                    id,
                    text: text.to_string(),
                    logprob,
                    special,
                })
                .collect()
        } else {
            vec![]
        };
        match (&generation.generated_text, iterator.peek()) {
            // Last token of a finished generation: close the stream.
            (Some(generated_text), None) => {
                // Generation has ended
                stopped = true;
                // Send message
                entry.response_tx.send(Ok(InferStreamResponse::End {
                    token,
                    top_tokens,
                    generated_text: GeneratedText::from(generated_text.clone()),
                    queued: entry.queue_time,
                    // batch_time is set once the entry joins a batch, so it
                    // must be present by the time tokens are produced.
                    start: entry.batch_time.unwrap(),
                }))?;
            }
            _ => {
                // Send message
                entry
                    .response_tx
                    .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
            }
        }
    }

    Ok(stopped)
}
|
||||
|
||||
/// Send errors to Infer for all `entries`.
///
/// Drains `entries` (the batch is being discarded), forwarding the same
/// generation error to every waiting client. Send failures are ignored:
/// a gone receiver just means the client already disconnected.
#[instrument(skip_all)]
fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
    entries.drain().for_each(|(_, entry)| {
        // Create and enter a span to link this function back to the entry
        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
        let err = InferError::GenerationError(error.to_string());
        metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
        tracing::error!("{err}");

        // unwrap_or is valid here as we don't care if the receiver is gone.
        entry
            .response_tx
            .send(Err(err))
            .unwrap_or(());
    });
}
|
||||
|
||||
impl From<crate::client::GeneratedText> for GeneratedText {
|
||||
fn from(value: crate::client::GeneratedText) -> Self {
|
||||
let v3_finish_reason = crate::client::FinishReason::try_from(value.finish_reason).unwrap();
|
||||
let finish_reason = match v3_finish_reason {
|
||||
crate::client::FinishReason::Length => FinishReason::Length,
|
||||
crate::client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
|
||||
crate::client::FinishReason::StopSequence => FinishReason::StopSequence,
|
||||
};
|
||||
|
||||
Self {
|
||||
text: value.text,
|
||||
generated_tokens: value.generated_tokens,
|
||||
finish_reason,
|
||||
seed: value.seed,
|
||||
}
|
||||
}
|
||||
}
|
284
backends/v3/src/client/grpc_client.rs
Normal file
284
backends/v3/src/client/grpc_client.rs
Normal file
@ -0,0 +1,284 @@
|
||||
/// Single shard Client
|
||||
use crate::client::{pb, Chunk};
|
||||
use crate::client::{ClientError, Result, WARMUP_IMAGE_BASE64};
|
||||
use base64::engine::general_purpose::STANDARD;
|
||||
use base64::Engine;
|
||||
use grpc_metadata::InjectTelemetryContext;
|
||||
use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
|
||||
use pb::generate::v3::*;
|
||||
use std::cmp::min;
|
||||
use std::time::Duration;
|
||||
use tonic::transport::{Channel, Uri};
|
||||
use tracing::instrument;
|
||||
|
||||
/// Text Generation Inference gRPC client for a single shard.
#[derive(Debug, Clone)]
pub struct Client {
    // Generated tonic stub over the (cloneable) transport channel.
    stub: TextGenerationServiceClient<Channel>,
}
|
||||
|
||||
impl Client {
    /// Returns a client connected to the given url.
    #[allow(dead_code)]
    pub async fn connect(uri: Uri) -> Result<Self> {
        let channel = Channel::builder(uri).connect().await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a client connected to the given unix socket.
    pub async fn connect_uds(path: String) -> Result<Self> {
        // The http URI is a placeholder required by tonic; the custom
        // connector below routes every connection to the unix socket.
        let channel = Channel::from_shared("http://[::]:50051".to_string())
            .unwrap()
            .connect_with_connector(tower::service_fn(move |_: Uri| {
                tokio::net::UnixStream::connect(path.clone())
            }))
            .await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a list of uris or unix sockets of all shards.
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
        // Any RPC failure here is mapped to a connection error: an older
        // server without the v3 service looks exactly like this.
        let response = self.stub.service_discovery(request).await.map_err(|_| {
            ClientError::Connection("Server does not support v3 interface".to_string())
        })?;
        let urls = response
            .into_inner()
            .urls
            .into_iter()
            // Remove unix socket prefix
            .map(|url| match url.strip_prefix("unix://") {
                None => url,
                Some(stripped_url) => stripped_url.to_string(),
            })
            .collect();
        Ok(urls)
    }

    /// Get model info.
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<InfoResponse> {
        let request = tonic::Request::new(InfoRequest {}).inject_context();
        let response = self.stub.info(request).await?.into_inner();
        Ok(response)
    }

    /// Get model health.
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let request = tonic::Request::new(HealthRequest {}).inject_context();
        let response = self.stub.health(request).await?.into_inner();
        Ok(response)
    }

    /// Clear the past generations cache.
    /// `batch_id == None` clears the whole cache on the server side.
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
        self.stub.clear_cache(request).await?;
        Ok(())
    }

    /// Filter a cached batch, keeping only `request_ids`.
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let request = tonic::Request::new(FilterBatchRequest {
            batch_id,
            request_ids,
        })
        .inject_context();
        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
        Ok(filtered_batch.batch)
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    /// (`None` on older servers that do not report it).
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests until the prefill-token budget is exhausted.
        while n_tokens < max_prefill_tokens {
            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);

            let mut input_chunks = Vec::new();
            input_chunks
                .push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
            if n_tokens == 0 {
                input_chunks.push(
                    Chunk::Image(Image {
                        // Safe unwrap, because we control the data.
                        data: STANDARD.decode(WARMUP_IMAGE_BASE64).unwrap(),
                        mimetype: "image/jpeg;base64".to_string(),
                    })
                    .into(),
                );
            }

            // Send stringly-typed inputs for compatibility for backends that haven't
            // been updated to support chunks.

            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
                // NOTE(review): the format string below is empty, so nothing is
                // appended here — it looks like the image-markdown payload was
                // lost; confirm against upstream before relying on this path.
                inputs.push_str(&format!(
                    "",
                ));
            }

            requests.push(Request {
                id: 0,
                inputs,
                input_chunks: Some(Input {
                    chunks: input_chunks,
                }),
                // We truncate the input on the server side to be sure that it has the correct size
                truncate,
                // Blocks and slots will be set on the server side if we use paged attention
                blocks: vec![],
                slots: vec![],
                // Set sampling parameters to also take these ops into account in the max memory
                parameters: Some(NextTokenChooserParameters {
                    temperature: 0.9,
                    top_k: 10,
                    top_p: 0.9,
                    typical_p: 0.9,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 1.2,
                    frequency_penalty: 0.1,
                    watermark: true,
                    grammar: String::new(),
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
                    max_new_tokens: max_total_tokens - truncate,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
                prefill_logprobs: true,
                top_n_tokens: 20,
                adapter_id: None,
            });
            n_tokens += max_input_length;

            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
                break;
            }
        }

        let batch = Batch {
            id: 0,
            size: requests.len() as u32,
            requests,
            max_tokens: max_input_length,
            max_blocks: 0,
        };

        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
            max_input_length,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
        Ok(response.max_supported_total_tokens)
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch, plus forward/decode timings.
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
        let response = self.stub.prefill(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
        ))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch, plus decode timings.
    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
        let response = self.stub.decode(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            DecodeTimings::new(
                response.concat_ns,
                response.forward_ns,
                response.decode_ns,
                response.total_ns,
            ),
        ))
    }
}
|
||||
|
||||
/// Per-phase latencies reported by a shard for one prefill step.
pub struct PrefillTimings {
    // Model forward-pass time.
    pub forward: Duration,
    // Token decoding time.
    pub decode: Duration,
    // Total server-side time for the step.
    pub total: Duration,
}
|
||||
|
||||
impl PrefillTimings {
|
||||
fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
|
||||
Self {
|
||||
forward: Duration::from_nanos(forward_ns),
|
||||
decode: Duration::from_nanos(decode_ns),
|
||||
total: Duration::from_nanos(total_ns),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-phase latencies reported by a shard for one decode step.
pub struct DecodeTimings {
    // Batch-concatenation time; only present when batches were merged.
    pub concat: Option<Duration>,
    // Model forward-pass time.
    pub forward: Duration,
    // Token decoding time.
    pub decode: Duration,
    // Total server-side time for the step.
    pub total: Duration,
}
|
||||
|
||||
impl DecodeTimings {
|
||||
fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
|
||||
Self {
|
||||
concat: concat_ns.map(Duration::from_nanos),
|
||||
forward: Duration::from_nanos(forward_ns),
|
||||
decode: Duration::from_nanos(decode_ns),
|
||||
total: Duration::from_nanos(total_ns),
|
||||
}
|
||||
}
|
||||
}
|
76
backends/v3/src/client/mod.rs
Normal file
76
backends/v3/src/client/mod.rs
Normal file
@ -0,0 +1,76 @@
|
||||
//! Text Generation gRPC client library
|
||||
|
||||
use async_trait::async_trait;
|
||||
use thiserror::Error;
|
||||
use tonic::transport;
|
||||
use tonic::Status;
|
||||
|
||||
#[allow(clippy::derive_partial_eq_without_eq)]
|
||||
mod pb;
|
||||
|
||||
mod grpc_client;
|
||||
mod sharded_client;
|
||||
|
||||
pub use grpc_client::Client;
|
||||
pub use pb::generate::v3::{
|
||||
input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
|
||||
HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
|
||||
StoppingCriteriaParameters,
|
||||
};
|
||||
pub use sharded_client::ShardedClient;
|
||||
|
||||
/// Health probes a backend client must provide.
#[async_trait]
pub trait Health {
    /// Check if a generate server is healthy by asking it to allocate a tensor on device
    async fn device_health(&self) -> Result<()>;

    /// Check if a generate server is healthy by doing a forward pass.
    /// EXPENSIVE
    async fn model_health(&self) -> Result<()>;
}
|
||||
|
||||
/// Static properties reported by a model shard at startup.
#[derive(Debug)]
pub struct ShardInfo {
    // Whether the model requires padded (non-paged) batches.
    pub requires_padding: bool,
    // Model dtype, as a string (e.g. reported by the server).
    pub dtype: String,
    // Device the model runs on, as a string.
    pub device_type: String,
    // Sliding-attention window size, when the model has one.
    pub window_size: Option<u32>,
    // Number of speculative tokens per step.
    pub speculate: u32,
}
|
||||
|
||||
/// Errors returned by the gRPC shard clients.
#[derive(Error, Debug, Clone)]
pub enum ClientError {
    // Transport-level failure or unsupported server interface.
    #[error("Could not connect to Text Generation server: {0}")]
    Connection(String),
    // The server answered the RPC with an error status.
    #[error("Server error: {0}")]
    Generation(String),
    // A multi-shard call produced no per-shard results.
    #[error("Sharded results are empty")]
    EmptyResults,
}
|
||||
|
||||
impl From<Status> for ClientError {
|
||||
fn from(err: Status) -> Self {
|
||||
let err = Self::Generation(err.message().to_string());
|
||||
tracing::error!("{err}");
|
||||
err
|
||||
}
|
||||
}
|
||||
|
||||
impl From<transport::Error> for ClientError {
|
||||
fn from(err: transport::Error) -> Self {
|
||||
let err = Self::Connection(err.to_string());
|
||||
tracing::error!("{err}");
|
||||
err
|
||||
}
|
||||
}
|
||||
|
||||
// Small convenience re-wrapping of `Chunk`.
|
||||
impl From<Chunk> for InputChunk {
|
||||
fn from(chunk: Chunk) -> Self {
|
||||
InputChunk { chunk: Some(chunk) }
|
||||
}
|
||||
}
|
||||
|
||||
// Tiny base64-encoded image used to exercise vision heads during warmup
// (decoded in `Client::warmup`).
static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
|
||||
|
||||
pub type Result<T> = std::result::Result<T, ClientError>;
|
260
backends/v3/src/client/sharded_client.rs
Normal file
260
backends/v3/src/client/sharded_client.rs
Normal file
@ -0,0 +1,260 @@
|
||||
use crate::client::{ClientError, Result};
|
||||
/// Multi shard Client
|
||||
use crate::client::{Health, ShardInfo};
|
||||
|
||||
use crate::client::grpc_client::{DecodeTimings, PrefillTimings};
|
||||
use crate::client::{
|
||||
Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
|
||||
NextTokenChooserParameters, Request, StoppingCriteriaParameters,
|
||||
};
|
||||
use crate::client::{Chunk, InfoResponse, Input};
|
||||
use async_trait::async_trait;
|
||||
use futures::future::join_all;
|
||||
use tonic::transport::Uri;
|
||||
use tracing::instrument;
|
||||
|
||||
#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client: fans every call out to all
/// shard clients and combines their results.
pub struct ShardedClient {
    // One single-shard client per model shard.
    clients: Vec<Client>,
}
|
||||
|
||||
impl ShardedClient {
    /// Wrap an already-connected set of shard clients.
    fn new(clients: Vec<Client>) -> Self {
        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
    async fn from_master_client(mut master_client: Client) -> Result<Self> {
        // Get all uris/unix sockets from the master client
        let uris = master_client.service_discovery().await?;
        let futures = uris.into_iter().map(Client::connect_uds);
        // Fail if any single shard connection fails.
        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
        Ok(Self::new(clients?))
    }

    /// Returns a client connected to the given uri.
    #[allow(dead_code)]
    pub async fn connect(uri: Uri) -> Result<Self> {
        let master_client = Client::connect(uri).await?;
        Self::from_master_client(master_client).await
    }

    /// Returns a client connected to the given unix socket.
    pub async fn connect_uds(path: String) -> Result<Self> {
        let master_client = Client::connect_uds(path).await?;
        Self::from_master_client(master_client).await
    }

    /// Get the model info.
    /// All shards run the same model; the answer of the last shard is used.
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<ShardInfo> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.info())
            .collect();
        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
    }

    /// GRPC health check.
    /// Queries every shard; the answer of the last shard is returned.
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.health())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// Clear the past generations cache on every shard.
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.clear_cache(batch_id))
            .collect();
        // Collecting into Result<()> surfaces the first shard error, if any.
        join_all(futures).await.into_iter().collect()
    }

    /// Filter a cached batch on every shard.
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
            .collect();
        // all shards return the same message
        join_all(futures).await.pop().unwrap()
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| {
                Box::pin(client.warmup(
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                    max_batch_size,
                ))
            })
            .collect();
        // Take the minimum value across shards (the weakest shard bounds the batch).
        let results = join_all(futures)
            .await
            .into_iter()
            .collect::<Result<Vec<Option<u32>>>>()?;
        Ok(results.into_iter().flatten().min())
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }
}
|
||||
|
||||
impl From<InfoResponse> for ShardInfo {
|
||||
fn from(value: InfoResponse) -> Self {
|
||||
Self {
|
||||
requires_padding: value.requires_padding,
|
||||
dtype: value.dtype,
|
||||
device_type: value.device_type,
|
||||
window_size: value.window_size,
|
||||
speculate: value.speculate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl Health for ShardedClient {
    /// Cheap probe: a plain gRPC health check against every shard.
    async fn device_health(&self) -> Result<()> {
        // `health` needs `&mut self`, hence the clone of the (cheap) client.
        self.clone().health().await?;
        Ok(())
    }

    /// Expensive probe: run a real one-token prefill through the model.
    async fn model_health(&self) -> Result<()> {
        // Dummy batch of 1 token and 1 generated token
        let liveness_request = Request {
            // u64::MAX avoids colliding with real request ids.
            id: u64::MAX,
            inputs: "liveness".to_string(),
            input_chunks: Some(Input {
                chunks: vec![Chunk::Text("liveness".into()).into()],
            }),
            truncate: 10,
            prefill_logprobs: false,
            // Greedy, penalty-free sampling: the cheapest possible decode.
            parameters: Some(NextTokenChooserParameters {
                temperature: 1.0,
                top_k: 0,
                top_p: 1.0,
                typical_p: 1.0,
                do_sample: false,
                seed: 0,
                repetition_penalty: 1.0,
                frequency_penalty: 0.0,
                watermark: false,
                grammar: String::new(),
                grammar_type: GrammarType::None as i32,
            }),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: 1,
                stop_sequences: vec![],
                ignore_eos_token: false,
            }),
            top_n_tokens: 0,
            // Block 0 is reserved for health checks
            blocks: vec![0],
            slots: (0..16).collect(),
            adapter_id: None,
        };
        let batch = Batch {
            id: u64::MAX,
            requests: vec![liveness_request],
            size: 1,
            max_tokens: 2,
            max_blocks: 1,
        };
        self.clone().prefill(batch).await?;
        Ok(())
    }
}
|
142
backends/v3/src/lib.rs
Normal file
142
backends/v3/src/lib.rs
Normal file
@ -0,0 +1,142 @@
|
||||
mod backend;
|
||||
mod block_allocator;
|
||||
mod client;
|
||||
mod queue;
|
||||
|
||||
use crate::client::{ClientError, ShardedClient};
|
||||
pub(crate) use backend::BackendV3;
|
||||
use serde::Serialize;
|
||||
use thiserror::Error;
|
||||
use utoipa::ToSchema;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, ToSchema)]
|
||||
pub struct BackendInfo {
|
||||
/// Mandatory
|
||||
#[schema(example = "cuda")]
|
||||
pub model_device_type: String,
|
||||
#[schema(example = "torch.float16")]
|
||||
pub model_dtype: String,
|
||||
|
||||
/// Backend parameters
|
||||
#[schema(example = "1")]
|
||||
pub speculate: usize,
|
||||
#[schema(example = "1.2")]
|
||||
pub waiting_served_ratio: f32,
|
||||
#[schema(example = "32000")]
|
||||
pub max_batch_total_tokens: u32,
|
||||
#[schema(example = "20")]
|
||||
pub max_waiting_tokens: usize,
|
||||
#[schema(nullable = true, example = "null")]
|
||||
pub max_batch_size: Option<usize>,
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn connect_backend(
|
||||
max_input_tokens: usize,
|
||||
max_total_tokens: usize,
|
||||
master_shard_uds_path: String,
|
||||
waiting_served_ratio: f32,
|
||||
max_batch_prefill_tokens: u32,
|
||||
max_batch_total_tokens: Option<u32>,
|
||||
max_waiting_tokens: usize,
|
||||
max_batch_size: Option<usize>,
|
||||
) -> Result<(BackendV3, BackendInfo), V3Error> {
|
||||
// Helper function
|
||||
let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
|
||||
match max_supported_batch_total_tokens {
|
||||
// Older models do not support automatic max-batch-total-tokens
|
||||
None => {
|
||||
let max_batch_total_tokens = max_batch_total_tokens
|
||||
.unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
|
||||
tracing::warn!("Model does not support automatic max batch total tokens");
|
||||
Ok(max_batch_total_tokens)
|
||||
}
|
||||
// Flash attention models return their max supported total tokens
|
||||
Some(max_supported_batch_total_tokens) => {
|
||||
// Warn if user added his own max-batch-total-tokens as we will ignore it
|
||||
if max_batch_total_tokens.is_some() {
|
||||
tracing::warn!(
|
||||
"`--max-batch-total-tokens` is deprecated for Flash \
|
||||
Attention models."
|
||||
);
|
||||
tracing::warn!(
|
||||
"Inferred max batch total tokens: {max_supported_batch_total_tokens}"
|
||||
);
|
||||
}
|
||||
if max_total_tokens as u32 > max_supported_batch_total_tokens {
|
||||
return Err(V3Error::NotEnoughMemory(max_total_tokens));
|
||||
}
|
||||
|
||||
Ok(max_supported_batch_total_tokens)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
|
||||
.await
|
||||
.map_err(V3Error::Connection)?;
|
||||
|
||||
// server is running on v3
|
||||
// Clear the cache; useful if the webserver rebooted
|
||||
sharded_client
|
||||
.clear_cache(None)
|
||||
.await
|
||||
.map_err(V3Error::Cache)?;
|
||||
// Get info from the shard
|
||||
let shard_info = sharded_client.info().await.map_err(V3Error::Info)?;
|
||||
|
||||
// Warmup model
|
||||
tracing::info!("Warming up model");
|
||||
let max_batch_total_tokens = check_max_batch_total_tokens(
|
||||
sharded_client
|
||||
.warmup(
|
||||
max_input_tokens as u32,
|
||||
max_batch_prefill_tokens,
|
||||
max_total_tokens as u32,
|
||||
max_batch_size,
|
||||
)
|
||||
.await
|
||||
.map_err(V3Error::Warmup)?,
|
||||
)?;
|
||||
tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
|
||||
|
||||
let backend_info = BackendInfo {
|
||||
waiting_served_ratio,
|
||||
max_batch_total_tokens,
|
||||
max_waiting_tokens,
|
||||
max_batch_size,
|
||||
model_device_type: shard_info.device_type.clone(),
|
||||
model_dtype: shard_info.dtype.clone(),
|
||||
speculate: shard_info.speculate as usize,
|
||||
};
|
||||
|
||||
let backend = BackendV3::new(
|
||||
sharded_client,
|
||||
waiting_served_ratio,
|
||||
max_batch_prefill_tokens,
|
||||
max_batch_total_tokens,
|
||||
max_waiting_tokens,
|
||||
max_batch_size,
|
||||
shard_info.requires_padding,
|
||||
shard_info.window_size,
|
||||
shard_info.speculate,
|
||||
);
|
||||
|
||||
tracing::info!("Using backend V3");
|
||||
|
||||
Ok((backend, backend_info))
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum V3Error {
|
||||
#[error("Unable to clear the Python model shards cache: {0}")]
|
||||
Cache(ClientError),
|
||||
#[error("Unable to connect to the Python model shards: {0}")]
|
||||
Connection(ClientError),
|
||||
#[error("Unable to get the Python model shards info: {0}")]
|
||||
Info(ClientError),
|
||||
#[error("Unable to warmup the Python model shards: {0}")]
|
||||
Warmup(ClientError),
|
||||
#[error("Not enough memory to handle `max_total_tokens={0}`")]
|
||||
NotEnoughMemory(usize),
|
||||
}
|
204
backends/v3/src/main.rs
Normal file
204
backends/v3/src/main.rs
Normal file
@ -0,0 +1,204 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use text_generation_router::{server, usage_stats};
|
||||
use text_generation_router_v3::{connect_backend, V3Error};
|
||||
use thiserror::Error;
|
||||
|
||||
/// App Configuration
|
||||
#[derive(Parser, Debug)]
|
||||
#[clap(author, version, about, long_about = None)]
|
||||
struct Args {
|
||||
#[command(subcommand)]
|
||||
command: Option<Commands>,
|
||||
|
||||
#[clap(default_value = "128", long, env)]
|
||||
max_concurrent_requests: usize,
|
||||
#[clap(default_value = "2", long, env)]
|
||||
max_best_of: usize,
|
||||
#[clap(default_value = "4", long, env)]
|
||||
max_stop_sequences: usize,
|
||||
#[clap(default_value = "5", long, env)]
|
||||
max_top_n_tokens: u32,
|
||||
#[clap(default_value = "1024", long, env)]
|
||||
max_input_tokens: usize,
|
||||
#[clap(default_value = "2048", long, env)]
|
||||
max_total_tokens: usize,
|
||||
#[clap(default_value = "1.2", long, env)]
|
||||
waiting_served_ratio: f32,
|
||||
#[clap(default_value = "4096", long, env)]
|
||||
max_batch_prefill_tokens: u32,
|
||||
#[clap(long, env)]
|
||||
max_batch_total_tokens: Option<u32>,
|
||||
#[clap(default_value = "20", long, env)]
|
||||
max_waiting_tokens: usize,
|
||||
#[clap(long, env)]
|
||||
max_batch_size: Option<usize>,
|
||||
#[clap(default_value = "0.0.0.0", long, env)]
|
||||
hostname: String,
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
#[clap(default_value = "/tmp/text-generation-server-0", long, env)]
|
||||
master_shard_uds_path: String,
|
||||
#[clap(default_value = "bigscience/bloom", long, env)]
|
||||
tokenizer_name: String,
|
||||
#[clap(long, env)]
|
||||
tokenizer_config_path: Option<String>,
|
||||
#[clap(long, env)]
|
||||
revision: Option<String>,
|
||||
#[clap(default_value = "2", long, env)]
|
||||
validation_workers: usize,
|
||||
#[clap(long, env)]
|
||||
api_key: Option<String>,
|
||||
#[clap(long, env)]
|
||||
json_output: bool,
|
||||
#[clap(long, env)]
|
||||
otlp_endpoint: Option<String>,
|
||||
#[clap(default_value = "text-generation-inference.router", long, env)]
|
||||
otlp_service_name: String,
|
||||
#[clap(long, env)]
|
||||
cors_allow_origin: Option<Vec<String>>,
|
||||
#[clap(long, env)]
|
||||
ngrok: bool,
|
||||
#[clap(long, env)]
|
||||
ngrok_authtoken: Option<String>,
|
||||
#[clap(long, env)]
|
||||
ngrok_edge: Option<String>,
|
||||
#[clap(long, env, default_value_t = false)]
|
||||
messages_api_enabled: bool,
|
||||
#[clap(long, env, default_value_t = false)]
|
||||
disable_grammar_support: bool,
|
||||
#[clap(default_value = "4", long, env)]
|
||||
max_client_batch_size: usize,
|
||||
#[clap(default_value = "on", long, env)]
|
||||
usage_stats: usage_stats::UsageStatsLevel,
|
||||
}
|
||||
|
||||
#[derive(Debug, Subcommand)]
|
||||
enum Commands {
|
||||
PrintSchema,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), RouterError> {
|
||||
// Get args
|
||||
let args = Args::parse();
|
||||
// Pattern match configuration
|
||||
let Args {
|
||||
command,
|
||||
max_concurrent_requests,
|
||||
max_best_of,
|
||||
max_stop_sequences,
|
||||
max_top_n_tokens,
|
||||
max_input_tokens,
|
||||
max_total_tokens,
|
||||
waiting_served_ratio,
|
||||
max_batch_prefill_tokens,
|
||||
max_batch_total_tokens,
|
||||
max_waiting_tokens,
|
||||
max_batch_size,
|
||||
hostname,
|
||||
port,
|
||||
master_shard_uds_path,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
revision,
|
||||
validation_workers,
|
||||
api_key,
|
||||
json_output,
|
||||
otlp_endpoint,
|
||||
otlp_service_name,
|
||||
cors_allow_origin,
|
||||
ngrok,
|
||||
ngrok_authtoken,
|
||||
ngrok_edge,
|
||||
messages_api_enabled,
|
||||
disable_grammar_support,
|
||||
max_client_batch_size,
|
||||
usage_stats,
|
||||
} = args;
|
||||
|
||||
if let Some(Commands::PrintSchema) = command {
|
||||
use utoipa::OpenApi;
|
||||
let api_doc = text_generation_router::server::ApiDoc::openapi();
|
||||
let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
|
||||
println!("{}", api_doc);
|
||||
std::process::exit(0);
|
||||
};
|
||||
text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
|
||||
|
||||
// Validate args
|
||||
if max_input_tokens >= max_total_tokens {
|
||||
return Err(RouterError::ArgumentValidation(
|
||||
"`max_input_tokens` must be < `max_total_tokens`".to_string(),
|
||||
));
|
||||
}
|
||||
if max_input_tokens as u32 > max_batch_prefill_tokens {
|
||||
return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
|
||||
}
|
||||
|
||||
if validation_workers == 0 {
|
||||
return Err(RouterError::ArgumentValidation(
|
||||
"`validation_workers` must be > 0".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
|
||||
if max_batch_prefill_tokens > *max_batch_total_tokens {
|
||||
return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
|
||||
}
|
||||
if max_total_tokens as u32 > *max_batch_total_tokens {
|
||||
return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
|
||||
}
|
||||
}
|
||||
|
||||
let (backend, _backend_info) = connect_backend(
|
||||
max_input_tokens,
|
||||
max_total_tokens,
|
||||
master_shard_uds_path,
|
||||
waiting_served_ratio,
|
||||
max_batch_prefill_tokens,
|
||||
max_batch_total_tokens,
|
||||
max_waiting_tokens,
|
||||
max_batch_size,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run server
|
||||
server::run(
|
||||
backend,
|
||||
max_concurrent_requests,
|
||||
max_best_of,
|
||||
max_stop_sequences,
|
||||
max_top_n_tokens,
|
||||
max_input_tokens,
|
||||
max_total_tokens,
|
||||
validation_workers,
|
||||
api_key,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
revision,
|
||||
hostname,
|
||||
port,
|
||||
cors_allow_origin,
|
||||
ngrok,
|
||||
ngrok_authtoken,
|
||||
ngrok_edge,
|
||||
messages_api_enabled,
|
||||
disable_grammar_support,
|
||||
max_client_batch_size,
|
||||
usage_stats,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
enum RouterError {
|
||||
#[error("Argument validation error: {0}")]
|
||||
ArgumentValidation(String),
|
||||
#[error("Backend failed: {0}")]
|
||||
Backend(#[from] V3Error),
|
||||
#[error("WebServer error: {0}")]
|
||||
WebServer(#[from] server::WebServerError),
|
||||
#[error("Tokio runtime failed to start: {0}")]
|
||||
Tokio(#[from] std::io::Error),
|
||||
}
|
@ -1,17 +1,17 @@
|
||||
use crate::infer::v3::block_allocator::{BlockAllocation, BlockAllocator};
|
||||
use crate::infer::InferError;
|
||||
use crate::infer::InferStreamResponse;
|
||||
use crate::validation::{
|
||||
ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
|
||||
use crate::block_allocator::{BlockAllocation, BlockAllocator};
|
||||
use crate::client;
|
||||
use crate::client::{
|
||||
Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
|
||||
};
|
||||
use nohash_hasher::{BuildNoHashHasher, IntMap};
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::VecDeque;
|
||||
use text_generation_client::v3::{
|
||||
Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
|
||||
use text_generation_router::infer::InferError;
|
||||
use text_generation_router::infer::InferStreamResponse;
|
||||
use text_generation_router::validation::{
|
||||
Chunk, ChunksToString, ValidGenerateRequest, ValidGrammar, ValidParameters,
|
||||
ValidStoppingParameters,
|
||||
};
|
||||
use text_generation_client::ChunksToString;
|
||||
use text_generation_client::Input;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use tokio::time::Instant;
|
||||
use tracing::{info_span, instrument, Instrument, Span};
|
||||
@ -337,8 +337,22 @@ impl State {
|
||||
batch_requests.push(Request {
|
||||
id,
|
||||
prefill_logprobs: entry.request.decoder_input_details,
|
||||
input_chunks: Some(Input {
|
||||
chunks: entry.request.inputs.clone(),
|
||||
input_chunks: Some(client::Input {
|
||||
chunks: entry
|
||||
.request
|
||||
.inputs
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|c| client::InputChunk {
|
||||
chunk: Some(match c {
|
||||
Chunk::Text(text) => client::Chunk::Text(text),
|
||||
Chunk::Image(image) => client::Chunk::Image(client::Image {
|
||||
data: image.data,
|
||||
mimetype: image.mimetype,
|
||||
}),
|
||||
}),
|
||||
})
|
||||
.collect(),
|
||||
}),
|
||||
inputs: entry.request.inputs.chunks_to_string(),
|
||||
truncate: entry.request.truncate,
|
@ -21,7 +21,7 @@ float-ord = "0.3.2"
|
||||
serde = {version = "1.0.188", features = ["derive"]}
|
||||
serde_json = "1.0"
|
||||
tabled = "0.14.0"
|
||||
text-generation-client = { path = "../router/client" }
|
||||
text-generation-client = { path = "../backends/client" }
|
||||
thiserror = "1.0.48"
|
||||
tokenizers = { workspace = true }
|
||||
tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
|
||||
|
@ -1,3 +1,6 @@
|
||||
# Legacy warning ⚠️
|
||||
The inference clients from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference) are recommended over `text_generation`.
|
||||
|
||||
# Text Generation
|
||||
|
||||
The Hugging Face Text Generation Python library provides a convenient way of interfacing with a
|
||||
|
@ -19,5 +19,15 @@ DEPRECATION_WARNING = (
|
||||
"Please use the `InferenceClient` from the `huggingface_hub` package instead."
|
||||
)
|
||||
|
||||
from text_generation.client import Client, AsyncClient
|
||||
from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
|
||||
from text_generation.client import Client, AsyncClient # noqa E402
|
||||
from text_generation.inference_api import ( # noqa E402
|
||||
InferenceAPIClient,
|
||||
InferenceAPIAsyncClient,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Client",
|
||||
"AsyncClient",
|
||||
"InferenceAPIClient",
|
||||
"InferenceAPIAsyncClient",
|
||||
]
|
||||
|
@ -21,7 +21,7 @@ def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
|
||||
List[DeployedModel]: list of all currently deployed models
|
||||
"""
|
||||
resp = requests.get(
|
||||
f"https://api-inference.huggingface.co/framework/text-generation-inference",
|
||||
"https://api-inference.huggingface.co/framework/text-generation-inference",
|
||||
headers=headers,
|
||||
timeout=5,
|
||||
)
|
||||
|
@ -10,7 +10,7 @@
|
||||
"name": "Apache 2.0",
|
||||
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
||||
},
|
||||
"version": "2.1.2-dev0"
|
||||
"version": "2.2.1-dev0"
|
||||
},
|
||||
"paths": {
|
||||
"/": {
|
||||
@ -909,7 +909,7 @@
|
||||
"tool_choice": {
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/components/schemas/ToolType"
|
||||
"$ref": "#/components/schemas/ToolChoice"
|
||||
}
|
||||
],
|
||||
"nullable": true
|
||||
@ -1580,16 +1580,11 @@
|
||||
"type": "object",
|
||||
"required": [
|
||||
"model_id",
|
||||
"model_dtype",
|
||||
"model_device_type",
|
||||
"max_concurrent_requests",
|
||||
"max_best_of",
|
||||
"max_stop_sequences",
|
||||
"max_input_tokens",
|
||||
"max_total_tokens",
|
||||
"waiting_served_ratio",
|
||||
"max_batch_total_tokens",
|
||||
"max_waiting_tokens",
|
||||
"validation_workers",
|
||||
"max_client_batch_size",
|
||||
"router",
|
||||
@ -1601,18 +1596,6 @@
|
||||
"example": "null",
|
||||
"nullable": true
|
||||
},
|
||||
"max_batch_size": {
|
||||
"type": "integer",
|
||||
"example": "null",
|
||||
"nullable": true,
|
||||
"minimum": 0
|
||||
},
|
||||
"max_batch_total_tokens": {
|
||||
"type": "integer",
|
||||
"format": "int32",
|
||||
"example": "32000",
|
||||
"minimum": 0
|
||||
},
|
||||
"max_best_of": {
|
||||
"type": "integer",
|
||||
"example": "2",
|
||||
@ -1644,19 +1627,6 @@
|
||||
"example": "2048",
|
||||
"minimum": 0
|
||||
},
|
||||
"max_waiting_tokens": {
|
||||
"type": "integer",
|
||||
"example": "20",
|
||||
"minimum": 0
|
||||
},
|
||||
"model_device_type": {
|
||||
"type": "string",
|
||||
"example": "cuda"
|
||||
},
|
||||
"model_dtype": {
|
||||
"type": "string",
|
||||
"example": "torch.float16"
|
||||
},
|
||||
"model_id": {
|
||||
"type": "string",
|
||||
"description": "Model info",
|
||||
@ -1690,11 +1660,6 @@
|
||||
"version": {
|
||||
"type": "string",
|
||||
"example": "0.5.0"
|
||||
},
|
||||
"waiting_served_ratio": {
|
||||
"type": "number",
|
||||
"format": "float",
|
||||
"example": "1.2"
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -2035,6 +2000,14 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"ToolChoice": {
|
||||
"allOf": [
|
||||
{
|
||||
"$ref": "#/components/schemas/ToolType"
|
||||
}
|
||||
],
|
||||
"nullable": true
|
||||
},
|
||||
"ToolType": {
|
||||
"oneOf": [
|
||||
{
|
||||
@ -2055,6 +2028,11 @@
|
||||
"$ref": "#/components/schemas/FunctionName"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"default": null,
|
||||
"nullable": true
|
||||
}
|
||||
]
|
||||
},
|
||||
|
@ -21,6 +21,8 @@
|
||||
title: Messages API
|
||||
- local: architecture
|
||||
title: Internal Architecture
|
||||
- local: usage_statistics
|
||||
title: Usage Statistics
|
||||
title: Getting started
|
||||
- sections:
|
||||
- local: basic_tutorials/consuming_tgi
|
||||
|
@ -349,6 +349,12 @@ Options:
|
||||
--cors-allow-origin <CORS_ALLOW_ORIGIN>
|
||||
[env: CORS_ALLOW_ORIGIN=]
|
||||
|
||||
```
|
||||
## API_KEY
|
||||
```shell
|
||||
--api-key <API_KEY>
|
||||
[env: API_KEY=]
|
||||
|
||||
```
|
||||
## WATERMARK_GAMMA
|
||||
```shell
|
||||
@ -424,6 +430,20 @@ Options:
|
||||
|
||||
[env: LORA_ADAPTERS=]
|
||||
|
||||
```
|
||||
## USAGE_STATS
|
||||
```shell
|
||||
--usage-stats <USAGE_STATS>
|
||||
Control if anonymous usage stats are collected. Options are "on", "off" and "no-stack" Defaul is on
|
||||
|
||||
[env: USAGE_STATS=]
|
||||
[default: on]
|
||||
|
||||
Possible values:
|
||||
- on: Default option, usage statistics are collected anonymously
|
||||
- off: Disables all collection of usage statistics
|
||||
- no-stack: Doesn't send the error stack trace or error type, but allows sending a crash event
|
||||
|
||||
```
|
||||
## HELP
|
||||
```shell
|
||||
|
@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
|
||||
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
|
||||
--device=/dev/kfd --device=/dev/dri --group-add video \
|
||||
--ipc=host --shm-size 256g --net host -v $volume:/data \
|
||||
ghcr.io/huggingface/text-generation-inference:2.1.1-rocm \
|
||||
ghcr.io/huggingface/text-generation-inference:2.2.0-rocm \
|
||||
--model-id $model
|
||||
```
|
||||
|
||||
|
@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
|
||||
docker run --rm --privileged --cap-add=sys_nice \
|
||||
--device=/dev/dri \
|
||||
--ipc=host --shm-size 1g --net host -v $volume:/data \
|
||||
ghcr.io/huggingface/text-generation-inference:latest-intel \
|
||||
ghcr.io/huggingface/text-generation-inference:2.2.0-intel \
|
||||
--model-id $model --cuda-graphs 0
|
||||
```
|
||||
|
||||
|
@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
|
||||
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
|
||||
|
||||
docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
|
||||
ghcr.io/huggingface/text-generation-inference:2.1.1 \
|
||||
ghcr.io/huggingface/text-generation-inference:2.2.0 \
|
||||
--model-id $model
|
||||
```
|
||||
|
||||
|
@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
|
||||
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
|
||||
|
||||
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
|
||||
ghcr.io/huggingface/text-generation-inference:2.1.1 \
|
||||
ghcr.io/huggingface/text-generation-inference:2.2.0 \
|
||||
--model-id $model
|
||||
```
|
||||
|
||||
@ -88,7 +88,7 @@ curl 127.0.0.1:8080/generate \
|
||||
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
|
||||
|
||||
```bash
|
||||
docker run ghcr.io/huggingface/text-generation-inference:2.1.1 --help
|
||||
docker run ghcr.io/huggingface/text-generation-inference:2.2.0 --help
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models on specific hardware
|
||||
|
||||
## Supported Models
|
||||
|
||||
- [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
|
||||
- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
|
||||
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
|
||||
- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
|
||||
|
75
docs/source/usage_statistics.md
Normal file
75
docs/source/usage_statistics.md
Normal file
@ -0,0 +1,75 @@
|
||||
|
||||
# Collection of Usage Statistics
|
||||
|
||||
Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data is used to improve TGI and to understand what causes failures. The data is collected transparently and any sensitive information is omitted.
|
||||
|
||||
Data is sent twice, once on server startup and once when server stops. Also, usage statistics are only enabled when TGI is running in docker to avoid collecting data then TGI runs directly on the host machine.
|
||||
|
||||
## What data is collected
|
||||
|
||||
The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
|
||||
As of release 2.1.2 this is an example of the data collected:
|
||||
|
||||
- From the TGI configuration:
|
||||
```json
|
||||
{
|
||||
"event_type": "start",
|
||||
"disable_grammar_support": false,
|
||||
"max_batch_prefill_tokens": 4096,
|
||||
"max_batch_size": null,
|
||||
"max_batch_total_tokens": null,
|
||||
"max_best_of": 2,
|
||||
"max_client_batch_size": 4,
|
||||
"max_concurrent_requests": 128,
|
||||
"max_input_tokens": 1024,
|
||||
"max_stop_sequences": 4,
|
||||
"max_top_n_tokens": 5,
|
||||
"max_total_tokens": 2048,
|
||||
"max_waiting_tokens": 20,
|
||||
"messages_api_enabled": false,
|
||||
"model_config": {
|
||||
"model_type": "Bloom"
|
||||
},
|
||||
"revision": null,
|
||||
"tokenizer_class": "BloomTokenizerFast",
|
||||
"validation_workers": 2,
|
||||
"waiting_served_ratio": 1.2,
|
||||
"docker_label": "latest",
|
||||
"git_sha": "cfc118704880453d29bcbe4fbbd91dda501cf5fe",
|
||||
"nvidia_env": {
|
||||
"name": "NVIDIA A10G",
|
||||
"pci_bus_id": "00000000:00:1E.0",
|
||||
"driver_version": "535.183.01",
|
||||
"pstate": "P8",
|
||||
"pcie_link_gen_max": "4",
|
||||
"pcie_link_gen_current": "1",
|
||||
"temperature_gpu": "31",
|
||||
"utilization_gpu": "0 %",
|
||||
"utilization_memory": "0 %",
|
||||
"memory_total": "23028 MiB",
|
||||
"memory_free": "22515 MiB",
|
||||
"memory_used": "0 MiB",
|
||||
"reset_status_reset_required": "No",
|
||||
"reset_status_drain_and_reset_recommended": "No",
|
||||
"compute_cap": "8.6",
|
||||
"ecc_errors_corrected_volatile_total": "0",
|
||||
"mig_mode_current": "[N/A]",
|
||||
"power_draw_instant": "10.86 W",
|
||||
"power_limit": "300.00 W"
|
||||
},
|
||||
"system_env": {
|
||||
"cpu_count": 16,
|
||||
"cpu_type": "AMD EPYC 7R32",
|
||||
"total_memory": 66681196544,
|
||||
"architecture": "x86_64",
|
||||
"platform": "linux-unix-x86_64"
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## How to opt-out
|
||||
|
||||
By passing the `--usage-stats` to the text-generation-launcher you can control how much usage statistics are being collected.
|
||||
`--usage-stats=no-stack` will not emit the stack traces from errors and the error types, but will continue to send start and stop events
|
||||
`--usage-stats=off` will completely disable everything
|
@ -4,7 +4,6 @@ import json
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
@ -271,7 +270,7 @@ class LauncherHandle:
|
||||
try:
|
||||
await self.client.generate("test")
|
||||
return
|
||||
except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e:
|
||||
except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
|
||||
time.sleep(1)
|
||||
raise RuntimeError("Health check failed")
|
||||
|
||||
@ -333,6 +332,8 @@ def launcher(event_loop):
|
||||
max_input_length: Optional[int] = None,
|
||||
max_batch_prefill_tokens: Optional[int] = None,
|
||||
max_total_tokens: Optional[int] = None,
|
||||
lora_adapters: Optional[List[str]] = None,
|
||||
cuda_graphs: Optional[List[int]] = None,
|
||||
):
|
||||
port = random.randint(8000, 10_000)
|
||||
master_port = random.randint(10_000, 20_000)
|
||||
@ -379,6 +380,14 @@ def launcher(event_loop):
|
||||
if max_total_tokens:
|
||||
args.append("--max-total-tokens")
|
||||
args.append(str(max_total_tokens))
|
||||
if lora_adapters:
|
||||
args.append("--lora-adapters")
|
||||
args.append(",".join(lora_adapters))
|
||||
if cuda_graphs:
|
||||
args.append("--cuda-graphs")
|
||||
args.append(",".join(map(str, cuda_graphs)))
|
||||
|
||||
print(" ".join(args), file=sys.stderr)
|
||||
|
||||
env["LOG_LEVEL"] = "info,text_generation_router=debug"
|
||||
|
||||
@ -418,6 +427,8 @@ def launcher(event_loop):
|
||||
max_input_length: Optional[int] = None,
|
||||
max_batch_prefill_tokens: Optional[int] = None,
|
||||
max_total_tokens: Optional[int] = None,
|
||||
lora_adapters: Optional[List[str]] = None,
|
||||
cuda_graphs: Optional[List[int]] = None,
|
||||
):
|
||||
port = random.randint(8000, 10_000)
|
||||
|
||||
@ -447,6 +458,12 @@ def launcher(event_loop):
|
||||
if max_total_tokens:
|
||||
args.append("--max-total-tokens")
|
||||
args.append(str(max_total_tokens))
|
||||
if lora_adapters:
|
||||
args.append("--lora-adapters")
|
||||
args.append(",".join(lora_adapters))
|
||||
if cuda_graphs:
|
||||
args.append("--cuda-graphs")
|
||||
args.append(",".join(map(str, cuda_graphs)))
|
||||
|
||||
client = docker.from_env()
|
||||
|
||||
|
@ -11,52 +11,52 @@
|
||||
},
|
||||
{
|
||||
"id": 49833,
|
||||
"logprob": -10.5625,
|
||||
"logprob": -10.5703125,
|
||||
"text": " dég"
|
||||
},
|
||||
{
|
||||
"id": 21543,
|
||||
"logprob": -0.14770508,
|
||||
"logprob": -0.14746094,
|
||||
"text": "uster"
|
||||
},
|
||||
{
|
||||
"id": 447,
|
||||
"logprob": -1.9287109,
|
||||
"logprob": -1.9277344,
|
||||
"text": " un"
|
||||
},
|
||||
{
|
||||
"id": 46341,
|
||||
"logprob": -15.4609375,
|
||||
"logprob": -15.421875,
|
||||
"text": " ort"
|
||||
},
|
||||
{
|
||||
"id": 35567,
|
||||
"logprob": -7.5585938,
|
||||
"logprob": -7.5820312,
|
||||
"text": "olan"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -1.4003906,
|
||||
"logprob": -1.4013672,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1669,
|
||||
"logprob": -1.5673828,
|
||||
"logprob": -1.5664062,
|
||||
"text": " il"
|
||||
},
|
||||
{
|
||||
"id": 11580,
|
||||
"logprob": -0.94628906,
|
||||
"logprob": -0.94189453,
|
||||
"text": " faut"
|
||||
},
|
||||
{
|
||||
"id": 3913,
|
||||
"logprob": -3.703125,
|
||||
"logprob": -3.6816406,
|
||||
"text": " tout"
|
||||
},
|
||||
{
|
||||
"id": 39261,
|
||||
"logprob": -1.5732422,
|
||||
"logprob": -1.7753906,
|
||||
"text": " d'abord"
|
||||
}
|
||||
],
|
||||
@ -64,65 +64,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 578,
|
||||
"logprob": -1.6591797,
|
||||
"logprob": -1.6318359,
|
||||
"special": false,
|
||||
"text": " le"
|
||||
},
|
||||
{
|
||||
"id": 5608,
|
||||
"logprob": -2.4492188,
|
||||
"logprob": -2.4882812,
|
||||
"special": false,
|
||||
"text": " faire"
|
||||
},
|
||||
{
|
||||
"id": 159570,
|
||||
"logprob": -6.6835938,
|
||||
"id": 7735,
|
||||
"logprob": -2.4355469,
|
||||
"special": false,
|
||||
"text": " réch"
|
||||
"text": " fond"
|
||||
},
|
||||
{
|
||||
"id": 810,
|
||||
"id": 289,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "au"
|
||||
"text": "re"
|
||||
},
|
||||
{
|
||||
"id": 12736,
|
||||
"id": 693,
|
||||
"logprob": -2.4472656,
|
||||
"special": false,
|
||||
"text": " à"
|
||||
},
|
||||
{
|
||||
"id": 366,
|
||||
"logprob": -1.1972656,
|
||||
"special": false,
|
||||
"text": " la"
|
||||
},
|
||||
{
|
||||
"id": 48844,
|
||||
"logprob": -1.7890625,
|
||||
"special": false,
|
||||
"text": " cass"
|
||||
},
|
||||
{
|
||||
"id": 1744,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "ffer"
|
||||
"text": "ero"
|
||||
},
|
||||
{
|
||||
"id": 1742,
|
||||
"logprob": -2.5175781,
|
||||
"special": false,
|
||||
"text": " au"
|
||||
},
|
||||
{
|
||||
"id": 6105,
|
||||
"logprob": -2.0078125,
|
||||
"special": false,
|
||||
"text": " bain"
|
||||
},
|
||||
{
|
||||
"id": 88254,
|
||||
"logprob": -0.12695312,
|
||||
"special": false,
|
||||
"text": "-mar"
|
||||
},
|
||||
{
|
||||
"id": 641,
|
||||
"id": 327,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "ie"
|
||||
"text": "le"
|
||||
},
|
||||
{
|
||||
"id": 2940,
|
||||
"logprob": -3.5175781,
|
||||
"logprob": -1.9335938,
|
||||
"special": false,
|
||||
"text": " avec"
|
||||
}
|
||||
]
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " le faire réchauffer au bain-marie avec"
|
||||
"generated_text": " le faire fondre à la casserole avec"
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
},
|
||||
{
|
||||
"id": 1669,
|
||||
"logprob": -5.4414062,
|
||||
"logprob": -5.4453125,
|
||||
"text": " il"
|
||||
},
|
||||
{
|
||||
@ -21,12 +21,12 @@
|
||||
},
|
||||
{
|
||||
"id": 3913,
|
||||
"logprob": -4.3554688,
|
||||
"logprob": -4.3320312,
|
||||
"text": " tout"
|
||||
},
|
||||
{
|
||||
"id": 39261,
|
||||
"logprob": -2.9238281,
|
||||
"logprob": -2.9160156,
|
||||
"text": " d'abord"
|
||||
}
|
||||
],
|
||||
@ -34,65 +34,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 408,
|
||||
"logprob": -0.07891846,
|
||||
"logprob": -0.16687012,
|
||||
"special": false,
|
||||
"text": " que"
|
||||
},
|
||||
{
|
||||
"id": 366,
|
||||
"logprob": -1.2939453,
|
||||
"logprob": -1.5517578,
|
||||
"special": false,
|
||||
"text": " la"
|
||||
},
|
||||
{
|
||||
"id": 8769,
|
||||
"logprob": -0.3708496,
|
||||
"logprob": -0.16687012,
|
||||
"special": false,
|
||||
"text": " personne"
|
||||
},
|
||||
{
|
||||
"id": 1479,
|
||||
"logprob": -2.2871094,
|
||||
"logprob": -2.1035156,
|
||||
"special": false,
|
||||
"text": " qui"
|
||||
},
|
||||
{
|
||||
"id": 2997,
|
||||
"logprob": -0.8671875,
|
||||
"id": 143926,
|
||||
"logprob": -2.8671875,
|
||||
"special": false,
|
||||
"text": " vous"
|
||||
"text": " réalise"
|
||||
},
|
||||
{
|
||||
"id": 35977,
|
||||
"logprob": -1.5097656,
|
||||
"id": 578,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " suit"
|
||||
"text": " le"
|
||||
},
|
||||
{
|
||||
"id": 21558,
|
||||
"logprob": -0.07891846,
|
||||
"id": 8138,
|
||||
"logprob": -0.66748047,
|
||||
"special": false,
|
||||
"text": " ait"
|
||||
"text": " projet"
|
||||
},
|
||||
{
|
||||
"id": 447,
|
||||
"logprob": -0.12695312,
|
||||
"id": 795,
|
||||
"logprob": -1.6279297,
|
||||
"special": false,
|
||||
"text": " un"
|
||||
"text": " ne"
|
||||
},
|
||||
{
|
||||
"id": 78606,
|
||||
"logprob": -2.21875,
|
||||
"id": 9802,
|
||||
"logprob": -0.47875977,
|
||||
"special": false,
|
||||
"text": " profil"
|
||||
"text": " soit"
|
||||
},
|
||||
{
|
||||
"id": 3899,
|
||||
"logprob": -1.3535156,
|
||||
"id": 1230,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " bien"
|
||||
"text": " pas"
|
||||
}
|
||||
]
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Pour déguster un ortolan, il faut tout d'abord que la personne qui vous suit ait un profil bien"
|
||||
"generated_text": "Pour déguster un ortolan, il faut tout d'abord que la personne qui réalise le projet ne soit pas"
|
||||
}
|
||||
|
@ -11,52 +11,52 @@
|
||||
},
|
||||
{
|
||||
"id": 49833,
|
||||
"logprob": -10.5390625,
|
||||
"logprob": -10.546875,
|
||||
"text": " dég"
|
||||
},
|
||||
{
|
||||
"id": 21543,
|
||||
"logprob": -0.14758301,
|
||||
"logprob": -0.14819336,
|
||||
"text": "uster"
|
||||
},
|
||||
{
|
||||
"id": 447,
|
||||
"logprob": -1.9296875,
|
||||
"logprob": -1.9257812,
|
||||
"text": " un"
|
||||
},
|
||||
{
|
||||
"id": 46341,
|
||||
"logprob": -15.4453125,
|
||||
"logprob": -15.4296875,
|
||||
"text": " ort"
|
||||
},
|
||||
{
|
||||
"id": 35567,
|
||||
"logprob": -7.59375,
|
||||
"logprob": -7.5625,
|
||||
"text": "olan"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -1.3994141,
|
||||
"logprob": -1.4199219,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1669,
|
||||
"logprob": -1.578125,
|
||||
"logprob": -1.5634766,
|
||||
"text": " il"
|
||||
},
|
||||
{
|
||||
"id": 11580,
|
||||
"logprob": -0.9453125,
|
||||
"logprob": -0.9458008,
|
||||
"text": " faut"
|
||||
},
|
||||
{
|
||||
"id": 3913,
|
||||
"logprob": -3.7011719,
|
||||
"logprob": -3.6816406,
|
||||
"text": " tout"
|
||||
},
|
||||
{
|
||||
"id": 39261,
|
||||
"logprob": -1.5732422,
|
||||
"logprob": -1.7753906,
|
||||
"text": " d'abord"
|
||||
}
|
||||
],
|
||||
@ -64,65 +64,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 578,
|
||||
"logprob": -1.6474609,
|
||||
"logprob": -1.828125,
|
||||
"special": false,
|
||||
"text": " le"
|
||||
},
|
||||
{
|
||||
"id": 5608,
|
||||
"logprob": -2.5097656,
|
||||
"logprob": -2.5546875,
|
||||
"special": false,
|
||||
"text": " faire"
|
||||
},
|
||||
{
|
||||
"id": 159570,
|
||||
"logprob": -6.65625,
|
||||
"id": 7735,
|
||||
"logprob": -2.4277344,
|
||||
"special": false,
|
||||
"text": " réch"
|
||||
"text": " fond"
|
||||
},
|
||||
{
|
||||
"id": 810,
|
||||
"id": 289,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "au"
|
||||
"text": "re"
|
||||
},
|
||||
{
|
||||
"id": 12736,
|
||||
"id": 693,
|
||||
"logprob": -2.4472656,
|
||||
"special": false,
|
||||
"text": " à"
|
||||
},
|
||||
{
|
||||
"id": 366,
|
||||
"logprob": -1.1494141,
|
||||
"special": false,
|
||||
"text": " la"
|
||||
},
|
||||
{
|
||||
"id": 48844,
|
||||
"logprob": -1.7939453,
|
||||
"special": false,
|
||||
"text": " cass"
|
||||
},
|
||||
{
|
||||
"id": 1744,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "ffer"
|
||||
"text": "ero"
|
||||
},
|
||||
{
|
||||
"id": 1742,
|
||||
"logprob": -2.5859375,
|
||||
"special": false,
|
||||
"text": " au"
|
||||
},
|
||||
{
|
||||
"id": 6105,
|
||||
"logprob": -2.03125,
|
||||
"special": false,
|
||||
"text": " bain"
|
||||
},
|
||||
{
|
||||
"id": 88254,
|
||||
"logprob": -0.12695312,
|
||||
"special": false,
|
||||
"text": "-mar"
|
||||
},
|
||||
{
|
||||
"id": 641,
|
||||
"id": 327,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "ie"
|
||||
"text": "le"
|
||||
},
|
||||
{
|
||||
"id": 2940,
|
||||
"logprob": -3.5175781,
|
||||
"logprob": -1.9013672,
|
||||
"special": false,
|
||||
"text": " avec"
|
||||
}
|
||||
]
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " le faire réchauffer au bain-marie avec"
|
||||
"generated_text": " le faire fondre à la casserole avec"
|
||||
}
|
||||
|
@ -1,11 +1,17 @@
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"finish_reason": "eos_token",
|
||||
"finish_reason": "stop",
|
||||
"index": 1,
|
||||
"logprobs": null,
|
||||
"text": " PR for more information?"
|
||||
},
|
||||
{
|
||||
"finish_reason": "length",
|
||||
"index": 3,
|
||||
"logprobs": null,
|
||||
"text": "hd20220811-"
|
||||
},
|
||||
{
|
||||
"finish_reason": "length",
|
||||
"index": 0,
|
||||
@ -17,19 +23,13 @@
|
||||
"index": 2,
|
||||
"logprobs": null,
|
||||
"text": " severely flawed and often has a substandard"
|
||||
},
|
||||
{
|
||||
"finish_reason": "length",
|
||||
"index": 3,
|
||||
"logprobs": null,
|
||||
"text": "hd20220811-"
|
||||
}
|
||||
],
|
||||
"created": 1713284455,
|
||||
"created": 1722014725,
|
||||
"id": "",
|
||||
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
"object": "text_completion",
|
||||
"system_fingerprint": "2.0.1-native",
|
||||
"system_fingerprint": "2.2.1-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 36,
|
||||
"prompt_tokens": 8,
|
||||
|
@ -0,0 +1,89 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 100000,
|
||||
"logprob": null,
|
||||
"text": "<|begin▁of▁sentence|>"
|
||||
},
|
||||
{
|
||||
"id": 3533,
|
||||
"logprob": -9.625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -11.1875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 185,
|
||||
"logprob": -1.5546875,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 549,
|
||||
"logprob": -2.84375,
|
||||
"special": false,
|
||||
"text": "The"
|
||||
},
|
||||
{
|
||||
"id": 1727,
|
||||
"logprob": -2.34375,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -0.8359375,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.0859375,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 254,
|
||||
"logprob": -1.5390625,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 1022,
|
||||
"logprob": -1.1875,
|
||||
"special": false,
|
||||
"text": " first"
|
||||
},
|
||||
{
|
||||
"id": 3458,
|
||||
"logprob": -0.35546875,
|
||||
"special": false,
|
||||
"text": " step"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -0.8828125,
|
||||
"special": false,
|
||||
"text": " in"
|
||||
},
|
||||
{
|
||||
"id": 254,
|
||||
"logprob": -0.71484375,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\nThe test request is the first step in the"
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "eos_token",
|
||||
"generated_tokens": 4,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 100000,
|
||||
"logprob": null,
|
||||
"text": "<|begin▁of▁sentence|>"
|
||||
},
|
||||
{
|
||||
"id": 3533,
|
||||
"logprob": -9.625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -11.25,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 2143,
|
||||
"logprob": -1.828125,
|
||||
"special": false,
|
||||
"text": " sent"
|
||||
},
|
||||
{
|
||||
"id": 10081,
|
||||
"logprob": -0.41210938,
|
||||
"special": false,
|
||||
"text": " successfully"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 100001,
|
||||
"logprob": -0.16015625,
|
||||
"special": true,
|
||||
"text": "<|end▁of▁sentence|>"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Test request sent successfully."
|
||||
}
|
@ -0,0 +1,358 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 100000,
|
||||
"logprob": null,
|
||||
"text": "<|begin▁of▁sentence|>"
|
||||
},
|
||||
{
|
||||
"id": 3533,
|
||||
"logprob": -9.625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -11.25,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 185,
|
||||
"logprob": -1.546875,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 549,
|
||||
"logprob": -2.859375,
|
||||
"special": false,
|
||||
"text": "The"
|
||||
},
|
||||
{
|
||||
"id": 1727,
|
||||
"logprob": -2.359375,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -0.83203125,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 245,
|
||||
"logprob": -1.5703125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 3412,
|
||||
"logprob": -2.578125,
|
||||
"special": false,
|
||||
"text": " document"
|
||||
},
|
||||
{
|
||||
"id": 344,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.6953125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 1222,
|
||||
"logprob": -1.75,
|
||||
"special": false,
|
||||
"text": " used"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\nThe test request is a document that is used"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 100000,
|
||||
"logprob": null,
|
||||
"text": "<|begin▁of▁sentence|>"
|
||||
},
|
||||
{
|
||||
"id": 3533,
|
||||
"logprob": -9.625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -11.25,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 185,
|
||||
"logprob": -1.546875,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 549,
|
||||
"logprob": -2.859375,
|
||||
"special": false,
|
||||
"text": "The"
|
||||
},
|
||||
{
|
||||
"id": 1727,
|
||||
"logprob": -2.359375,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -0.83203125,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 245,
|
||||
"logprob": -1.5703125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 3412,
|
||||
"logprob": -2.578125,
|
||||
"special": false,
|
||||
"text": " document"
|
||||
},
|
||||
{
|
||||
"id": 344,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.6953125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 1222,
|
||||
"logprob": -1.75,
|
||||
"special": false,
|
||||
"text": " used"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\nThe test request is a document that is used"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 100000,
|
||||
"logprob": null,
|
||||
"text": "<|begin▁of▁sentence|>"
|
||||
},
|
||||
{
|
||||
"id": 3533,
|
||||
"logprob": -9.625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -11.25,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 185,
|
||||
"logprob": -1.546875,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 549,
|
||||
"logprob": -2.859375,
|
||||
"special": false,
|
||||
"text": "The"
|
||||
},
|
||||
{
|
||||
"id": 1727,
|
||||
"logprob": -2.359375,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -0.83203125,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 245,
|
||||
"logprob": -1.5703125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 3412,
|
||||
"logprob": -2.578125,
|
||||
"special": false,
|
||||
"text": " document"
|
||||
},
|
||||
{
|
||||
"id": 344,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.6953125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 1222,
|
||||
"logprob": -1.75,
|
||||
"special": false,
|
||||
"text": " used"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\nThe test request is a document that is used"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 100000,
|
||||
"logprob": null,
|
||||
"text": "<|begin▁of▁sentence|>"
|
||||
},
|
||||
{
|
||||
"id": 3533,
|
||||
"logprob": -9.625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -11.25,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 185,
|
||||
"logprob": -1.546875,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 549,
|
||||
"logprob": -2.859375,
|
||||
"special": false,
|
||||
"text": "The"
|
||||
},
|
||||
{
|
||||
"id": 1727,
|
||||
"logprob": -2.359375,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 3102,
|
||||
"logprob": -0.83203125,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 245,
|
||||
"logprob": -1.5703125,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 3412,
|
||||
"logprob": -2.578125,
|
||||
"special": false,
|
||||
"text": " document"
|
||||
},
|
||||
{
|
||||
"id": 344,
|
||||
"logprob": -1.125,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 317,
|
||||
"logprob": -1.6953125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 1222,
|
||||
"logprob": -1.75,
|
||||
"special": false,
|
||||
"text": " used"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "\nThe test request is a document that is used"
|
||||
}
|
||||
]
|
@ -11,12 +11,12 @@
|
||||
},
|
||||
{
|
||||
"id": 2015,
|
||||
"logprob": -10.0,
|
||||
"logprob": -10.0625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3853,
|
||||
"logprob": -10.875,
|
||||
"logprob": -11.0,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
@ -24,7 +24,7 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 1736,
|
||||
"logprob": -2.09375,
|
||||
"logprob": -2.03125,
|
||||
"special": false,
|
||||
"text": " form"
|
||||
},
|
||||
@ -42,48 +42,48 @@
|
||||
},
|
||||
{
|
||||
"id": 2121,
|
||||
"logprob": -1.8203125,
|
||||
"logprob": -1.8125,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 3853,
|
||||
"logprob": -0.23242188,
|
||||
"logprob": -0.24121094,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
},
|
||||
{
|
||||
"id": 1736,
|
||||
"logprob": -0.08544922,
|
||||
"logprob": -0.100097656,
|
||||
"special": false,
|
||||
"text": " form"
|
||||
},
|
||||
{
|
||||
"id": 603,
|
||||
"logprob": -0.9375,
|
||||
"logprob": -0.9453125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 1671,
|
||||
"logprob": -1.671875,
|
||||
"id": 476,
|
||||
"logprob": -1.703125,
|
||||
"special": false,
|
||||
"text": " used"
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 577,
|
||||
"logprob": -0.40429688,
|
||||
"id": 4551,
|
||||
"logprob": -2.453125,
|
||||
"special": false,
|
||||
"text": " to"
|
||||
"text": " document"
|
||||
},
|
||||
{
|
||||
"id": 3853,
|
||||
"logprob": -1.1875,
|
||||
"id": 674,
|
||||
"logprob": -0.796875,
|
||||
"special": false,
|
||||
"text": " request"
|
||||
"text": " that"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " form\n\nThe test request form is used to request"
|
||||
"generated_text": " form\n\nThe test request form is a document that"
|
||||
}
|
||||
|
@ -11,12 +11,12 @@
|
||||
},
|
||||
{
|
||||
"id": 2015,
|
||||
"logprob": -10.0,
|
||||
"logprob": -10.0625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3853,
|
||||
"logprob": -10.875,
|
||||
"logprob": -11.0,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
@ -24,7 +24,7 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 7539,
|
||||
"logprob": -0.73046875,
|
||||
"logprob": -0.609375,
|
||||
"special": false,
|
||||
"text": " forms"
|
||||
},
|
||||
@ -36,7 +36,7 @@
|
||||
},
|
||||
{
|
||||
"id": 671,
|
||||
"logprob": -1.703125,
|
||||
"logprob": -1.5546875,
|
||||
"special": false,
|
||||
"text": " an"
|
||||
},
|
||||
@ -66,24 +66,24 @@
|
||||
},
|
||||
{
|
||||
"id": 11859,
|
||||
"logprob": -1.6953125,
|
||||
"logprob": -1.953125,
|
||||
"special": false,
|
||||
"text": " lab"
|
||||
},
|
||||
{
|
||||
"id": 2185,
|
||||
"logprob": -1.3125,
|
||||
"logprob": -1.7734375,
|
||||
"special": false,
|
||||
"text": " process"
|
||||
},
|
||||
{
|
||||
"id": 578,
|
||||
"logprob": -1.5,
|
||||
"id": 235265,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
"text": "."
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Test request forms are an essential part of the lab process and"
|
||||
"generated_text": "Test request forms are an essential part of the lab process."
|
||||
}
|
||||
|
@ -0,0 +1,254 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": null,
|
||||
"text": "<bos>"
|
||||
},
|
||||
{
|
||||
"id": 106,
|
||||
"logprob": -47.25,
|
||||
"text": "<start_of_turn>"
|
||||
},
|
||||
{
|
||||
"id": 1645,
|
||||
"logprob": -18.875,
|
||||
"text": "user"
|
||||
},
|
||||
{
|
||||
"id": 235292,
|
||||
"logprob": -7.15625,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 108,
|
||||
"logprob": -4.78125,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 5559,
|
||||
"logprob": -10.0,
|
||||
"text": "Write"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.1171875,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 19592,
|
||||
"logprob": -2.46875,
|
||||
"text": " poem"
|
||||
},
|
||||
{
|
||||
"id": 577,
|
||||
"logprob": -5.84375,
|
||||
"text": " to"
|
||||
},
|
||||
{
|
||||
"id": 1707,
|
||||
"logprob": -6.375,
|
||||
"text": " help"
|
||||
},
|
||||
{
|
||||
"id": 682,
|
||||
"logprob": -2.125,
|
||||
"text": " me"
|
||||
},
|
||||
{
|
||||
"id": 5434,
|
||||
"logprob": -1.546875,
|
||||
"text": " remember"
|
||||
},
|
||||
{
|
||||
"id": 573,
|
||||
"logprob": -0.62890625,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 1370,
|
||||
"logprob": -6.65625,
|
||||
"text": " first"
|
||||
},
|
||||
{
|
||||
"id": 235248,
|
||||
"logprob": -1.84375,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 235274,
|
||||
"logprob": -0.45117188,
|
||||
"text": "1"
|
||||
},
|
||||
{
|
||||
"id": 235276,
|
||||
"logprob": -0.07421875,
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"id": 6635,
|
||||
"logprob": -2.109375,
|
||||
"text": " elements"
|
||||
},
|
||||
{
|
||||
"id": 611,
|
||||
"logprob": -0.4140625,
|
||||
"text": " on"
|
||||
},
|
||||
{
|
||||
"id": 573,
|
||||
"logprob": -0.0009536743,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 26163,
|
||||
"logprob": -0.033203125,
|
||||
"text": " periodic"
|
||||
},
|
||||
{
|
||||
"id": 3037,
|
||||
"logprob": -0.0002670288,
|
||||
"text": " table"
|
||||
},
|
||||
{
|
||||
"id": 235269,
|
||||
"logprob": -4.75,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 7385,
|
||||
"logprob": -11.625,
|
||||
"text": " giving"
|
||||
},
|
||||
{
|
||||
"id": 1853,
|
||||
"logprob": -4.875,
|
||||
"text": " each"
|
||||
},
|
||||
{
|
||||
"id": 5356,
|
||||
"logprob": -0.38867188,
|
||||
"text": " element"
|
||||
},
|
||||
{
|
||||
"id": 1277,
|
||||
"logprob": -3.65625,
|
||||
"text": " its"
|
||||
},
|
||||
{
|
||||
"id": 1997,
|
||||
"logprob": -4.4375,
|
||||
"text": " own"
|
||||
},
|
||||
{
|
||||
"id": 2017,
|
||||
"logprob": -0.29882812,
|
||||
"text": " line"
|
||||
},
|
||||
{
|
||||
"id": 235265,
|
||||
"logprob": -0.16699219,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 107,
|
||||
"logprob": -25.625,
|
||||
"text": "<end_of_turn>"
|
||||
},
|
||||
{
|
||||
"id": 108,
|
||||
"logprob": -6.75,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 106,
|
||||
"logprob": -39.5,
|
||||
"text": "<start_of_turn>"
|
||||
},
|
||||
{
|
||||
"id": 2516,
|
||||
"logprob": -32.5,
|
||||
"text": "model"
|
||||
},
|
||||
{
|
||||
"id": 235292,
|
||||
"logprob": -10.125,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 108,
|
||||
"logprob": -3.421875,
|
||||
"text": "\n"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 688,
|
||||
"logprob": -0.546875,
|
||||
"special": false,
|
||||
"text": "**"
|
||||
},
|
||||
{
|
||||
"id": 103889,
|
||||
"logprob": -0.49023438,
|
||||
"special": false,
|
||||
"text": "Hydrogen"
|
||||
},
|
||||
{
|
||||
"id": 190213,
|
||||
"logprob": -0.48632812,
|
||||
"special": false,
|
||||
"text": "**,"
|
||||
},
|
||||
{
|
||||
"id": 2611,
|
||||
"logprob": -0.58203125,
|
||||
"special": false,
|
||||
"text": " light"
|
||||
},
|
||||
{
|
||||
"id": 578,
|
||||
"logprob": -0.099121094,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 2223,
|
||||
"logprob": -1.078125,
|
||||
"special": false,
|
||||
"text": " free"
|
||||
},
|
||||
{
|
||||
"id": 235269,
|
||||
"logprob": -0.025756836,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 108,
|
||||
"logprob": -0.29101562,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 688,
|
||||
"logprob": -0.0035858154,
|
||||
"special": false,
|
||||
"text": "**"
|
||||
},
|
||||
{
|
||||
"id": 1949,
|
||||
"logprob": -4.1007996e-05,
|
||||
"special": false,
|
||||
"text": "He"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "**Hydrogen**, light and free,\n**He"
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -11,12 +11,12 @@
|
||||
},
|
||||
{
|
||||
"id": 2015,
|
||||
"logprob": -9.65625,
|
||||
"logprob": -9.640625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 3853,
|
||||
"logprob": -10.3671875,
|
||||
"logprob": -10.375,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
@ -24,66 +24,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 604,
|
||||
"logprob": -0.36938477,
|
||||
"logprob": -0.2824707,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 235248,
|
||||
"logprob": -1.8046875,
|
||||
"id": 573,
|
||||
"logprob": -0.19030762,
|
||||
"special": false,
|
||||
"text": " "
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 235274,
|
||||
"logprob": -0.46240234,
|
||||
"id": 16819,
|
||||
"logprob": -1.4892578,
|
||||
"special": false,
|
||||
"text": "1"
|
||||
"text": " detection"
|
||||
},
|
||||
{
|
||||
"id": 235284,
|
||||
"logprob": -1.7460938,
|
||||
"id": 576,
|
||||
"logprob": -0.7011719,
|
||||
"special": false,
|
||||
"text": "2"
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 235265,
|
||||
"logprob": -1.9443359,
|
||||
"id": 573,
|
||||
"logprob": -2.0195312,
|
||||
"special": false,
|
||||
"text": "."
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 235284,
|
||||
"logprob": -1.4550781,
|
||||
"special": false,
|
||||
"text": "2"
|
||||
},
|
||||
{
|
||||
"id": 235308,
|
||||
"logprob": -1.0205078,
|
||||
"special": false,
|
||||
"text": "5"
|
||||
},
|
||||
{
|
||||
"id": 235290,
|
||||
"logprob": -1.0283203,
|
||||
"special": false,
|
||||
"text": "-"
|
||||
},
|
||||
{
|
||||
"id": 235274,
|
||||
"logprob": -1.2783203,
|
||||
"special": false,
|
||||
"text": "1"
|
||||
},
|
||||
{
|
||||
"id": 235284,
|
||||
"id": 8566,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "2"
|
||||
"text": " presence"
|
||||
},
|
||||
{
|
||||
"id": 689,
|
||||
"logprob": -0.16491699,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 14862,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " absence"
|
||||
},
|
||||
{
|
||||
"id": 576,
|
||||
"logprob": -0.9946289,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 671,
|
||||
"logprob": -0.5263672,
|
||||
"special": false,
|
||||
"text": " an"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Test request for 12.25-12"
|
||||
"generated_text": "Test request for the detection of the presence or absence of an"
|
||||
}
|
||||
|
@ -0,0 +1,89 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.421875,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.546875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 369,
|
||||
"logprob": -2.1816406,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -2.6992188,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -3.6308594,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 679,
|
||||
"logprob": -1.7900391,
|
||||
"special": false,
|
||||
"text": "201"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"logprob": -1.3554688,
|
||||
"special": false,
|
||||
"text": "9"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"logprob": -2.0039062,
|
||||
"special": false,
|
||||
"text": "-"
|
||||
},
|
||||
{
|
||||
"id": 2366,
|
||||
"logprob": -0.4489746,
|
||||
"special": false,
|
||||
"text": "202"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -0.037109375,
|
||||
"special": false,
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"id": 2978,
|
||||
"logprob": -0.8100586,
|
||||
"special": false,
|
||||
"text": " school"
|
||||
},
|
||||
{
|
||||
"id": 1060,
|
||||
"logprob": -0.013015747,
|
||||
"special": false,
|
||||
"text": " year"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " for the 2019-2020 school year"
|
||||
}
|
@ -0,0 +1,89 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.5625,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.375,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 25,
|
||||
"logprob": -0.8984375,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 2209,
|
||||
"logprob": -2.78125,
|
||||
"special": false,
|
||||
"text": " Is"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -0.6328125,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 734,
|
||||
"logprob": -2.703125,
|
||||
"special": false,
|
||||
"text": " function"
|
||||
},
|
||||
{
|
||||
"id": 330,
|
||||
"logprob": -0.34179688,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
},
|
||||
{
|
||||
"id": 4110,
|
||||
"logprob": -2.359375,
|
||||
"special": false,
|
||||
"text": "Create"
|
||||
},
|
||||
{
|
||||
"id": 7575,
|
||||
"logprob": -2.1875,
|
||||
"special": false,
|
||||
"text": "Process"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": -0.07910156,
|
||||
"special": false,
|
||||
"text": "\""
|
||||
},
|
||||
{
|
||||
"id": 304,
|
||||
"logprob": -0.83203125,
|
||||
"special": false,
|
||||
"text": " in"
|
||||
},
|
||||
{
|
||||
"id": 12468,
|
||||
"logprob": -1.8203125,
|
||||
"special": false,
|
||||
"text": " Win"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Test request: Is the function \"CreateProcess\" in Win"
|
||||
}
|
@ -0,0 +1,358 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.421875,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.546875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 369,
|
||||
"logprob": -2.1816406,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -2.6992188,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -3.6308594,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 679,
|
||||
"logprob": -1.7988281,
|
||||
"special": false,
|
||||
"text": "201"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"logprob": -1.3535156,
|
||||
"special": false,
|
||||
"text": "9"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"logprob": -2.0058594,
|
||||
"special": false,
|
||||
"text": "-"
|
||||
},
|
||||
{
|
||||
"id": 2366,
|
||||
"logprob": -0.45410156,
|
||||
"special": false,
|
||||
"text": "202"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -0.037109375,
|
||||
"special": false,
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"id": 2978,
|
||||
"logprob": -0.8095703,
|
||||
"special": false,
|
||||
"text": " school"
|
||||
},
|
||||
{
|
||||
"id": 1060,
|
||||
"logprob": -0.013053894,
|
||||
"special": false,
|
||||
"text": " year"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " for the 2019-2020 school year"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.421875,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.546875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 369,
|
||||
"logprob": -2.1816406,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -2.6992188,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -3.6308594,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 679,
|
||||
"logprob": -1.7988281,
|
||||
"special": false,
|
||||
"text": "201"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"logprob": -1.3535156,
|
||||
"special": false,
|
||||
"text": "9"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"logprob": -2.0058594,
|
||||
"special": false,
|
||||
"text": "-"
|
||||
},
|
||||
{
|
||||
"id": 2366,
|
||||
"logprob": -0.45410156,
|
||||
"special": false,
|
||||
"text": "202"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -0.037109375,
|
||||
"special": false,
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"id": 2978,
|
||||
"logprob": -0.8095703,
|
||||
"special": false,
|
||||
"text": " school"
|
||||
},
|
||||
{
|
||||
"id": 1060,
|
||||
"logprob": -0.013053894,
|
||||
"special": false,
|
||||
"text": " year"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " for the 2019-2020 school year"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.421875,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.546875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 369,
|
||||
"logprob": -2.1816406,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -2.6992188,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -3.6308594,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 679,
|
||||
"logprob": -1.7988281,
|
||||
"special": false,
|
||||
"text": "201"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"logprob": -1.3535156,
|
||||
"special": false,
|
||||
"text": "9"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"logprob": -2.0058594,
|
||||
"special": false,
|
||||
"text": "-"
|
||||
},
|
||||
{
|
||||
"id": 2366,
|
||||
"logprob": -0.45410156,
|
||||
"special": false,
|
||||
"text": "202"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -0.037109375,
|
||||
"special": false,
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"id": 2978,
|
||||
"logprob": -0.8095703,
|
||||
"special": false,
|
||||
"text": " school"
|
||||
},
|
||||
{
|
||||
"id": 1060,
|
||||
"logprob": -0.013053894,
|
||||
"special": false,
|
||||
"text": " year"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " for the 2019-2020 school year"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
"logprob": null,
|
||||
"text": "<|begin_of_text|>"
|
||||
},
|
||||
{
|
||||
"id": 2323,
|
||||
"logprob": -9.421875,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"logprob": -10.546875,
|
||||
"text": " request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 369,
|
||||
"logprob": -2.1816406,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -2.6992188,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -3.6308594,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 679,
|
||||
"logprob": -1.7988281,
|
||||
"special": false,
|
||||
"text": "201"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"logprob": -1.3535156,
|
||||
"special": false,
|
||||
"text": "9"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"logprob": -2.0058594,
|
||||
"special": false,
|
||||
"text": "-"
|
||||
},
|
||||
{
|
||||
"id": 2366,
|
||||
"logprob": -0.45410156,
|
||||
"special": false,
|
||||
"text": "202"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"logprob": -0.037109375,
|
||||
"special": false,
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"id": 2978,
|
||||
"logprob": -0.8095703,
|
||||
"special": false,
|
||||
"text": " school"
|
||||
},
|
||||
{
|
||||
"id": 1060,
|
||||
"logprob": -0.013053894,
|
||||
"special": false,
|
||||
"text": " year"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " for the 2019-2020 school year"
|
||||
}
|
||||
]
|
@ -0,0 +1,89 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": null,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 4321,
|
||||
"logprob": -9.0859375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 2009,
|
||||
"logprob": -16.359375,
|
||||
"text": "request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5229,
|
||||
"logprob": -2.7988281,
|
||||
"special": false,
|
||||
"text": " failed"
|
||||
},
|
||||
{
|
||||
"id": 29901,
|
||||
"logprob": -0.91259766,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 853,
|
||||
"logprob": -2.8496094,
|
||||
"special": false,
|
||||
"text": " Un"
|
||||
},
|
||||
{
|
||||
"id": 23765,
|
||||
"logprob": -1.1894531,
|
||||
"special": false,
|
||||
"text": "supported"
|
||||
},
|
||||
{
|
||||
"id": 4714,
|
||||
"logprob": -1.5917969,
|
||||
"special": false,
|
||||
"text": " browser"
|
||||
},
|
||||
{
|
||||
"id": 29892,
|
||||
"logprob": -0.34765625,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1873,
|
||||
"logprob": -1.2695312,
|
||||
"special": false,
|
||||
"text": " version"
|
||||
},
|
||||
{
|
||||
"id": 470,
|
||||
"logprob": -0.25170898,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 7481,
|
||||
"logprob": -0.21411133,
|
||||
"special": false,
|
||||
"text": " platform"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.1162109,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " failed: Unsupported browser, version or platform\n"
|
||||
}
|
@ -0,0 +1,89 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": null,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 4321,
|
||||
"logprob": -9.0859375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 2009,
|
||||
"logprob": -16.359375,
|
||||
"text": "request"
|
||||
}
|
||||
],
|
||||
"seed": 0,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5229,
|
||||
"logprob": -0.6645508,
|
||||
"special": false,
|
||||
"text": " failed"
|
||||
},
|
||||
{
|
||||
"id": 29901,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 6527,
|
||||
"logprob": -2.2324219,
|
||||
"special": false,
|
||||
"text": " Could"
|
||||
},
|
||||
{
|
||||
"id": 451,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " not"
|
||||
},
|
||||
{
|
||||
"id": 6088,
|
||||
"logprob": -1.6074219,
|
||||
"special": false,
|
||||
"text": " parse"
|
||||
},
|
||||
{
|
||||
"id": 1243,
|
||||
"logprob": -1.6298828,
|
||||
"special": false,
|
||||
"text": " test"
|
||||
},
|
||||
{
|
||||
"id": 1206,
|
||||
"logprob": -0.72558594,
|
||||
"special": false,
|
||||
"text": " case"
|
||||
},
|
||||
{
|
||||
"id": 1024,
|
||||
"logprob": -0.40429688,
|
||||
"special": false,
|
||||
"text": " name"
|
||||
},
|
||||
{
|
||||
"id": 515,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 525,
|
||||
"logprob": -1.2519531,
|
||||
"special": false,
|
||||
"text": " '"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "Test request failed: Could not parse test case name from '"
|
||||
}
|
@ -0,0 +1,358 @@
|
||||
[
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": null,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 4321,
|
||||
"logprob": -9.0859375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 2009,
|
||||
"logprob": -16.359375,
|
||||
"text": "request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5229,
|
||||
"logprob": -2.7988281,
|
||||
"special": false,
|
||||
"text": " failed"
|
||||
},
|
||||
{
|
||||
"id": 29901,
|
||||
"logprob": -0.91259766,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 853,
|
||||
"logprob": -2.8496094,
|
||||
"special": false,
|
||||
"text": " Un"
|
||||
},
|
||||
{
|
||||
"id": 23765,
|
||||
"logprob": -1.1894531,
|
||||
"special": false,
|
||||
"text": "supported"
|
||||
},
|
||||
{
|
||||
"id": 4714,
|
||||
"logprob": -1.5917969,
|
||||
"special": false,
|
||||
"text": " browser"
|
||||
},
|
||||
{
|
||||
"id": 29892,
|
||||
"logprob": -0.34765625,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1873,
|
||||
"logprob": -1.2695312,
|
||||
"special": false,
|
||||
"text": " version"
|
||||
},
|
||||
{
|
||||
"id": 470,
|
||||
"logprob": -0.25170898,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 7481,
|
||||
"logprob": -0.21411133,
|
||||
"special": false,
|
||||
"text": " platform"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.1162109,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " failed: Unsupported browser, version or platform\n"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": null,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 4321,
|
||||
"logprob": -9.0859375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 2009,
|
||||
"logprob": -16.359375,
|
||||
"text": "request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5229,
|
||||
"logprob": -2.7988281,
|
||||
"special": false,
|
||||
"text": " failed"
|
||||
},
|
||||
{
|
||||
"id": 29901,
|
||||
"logprob": -0.91259766,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 853,
|
||||
"logprob": -2.8496094,
|
||||
"special": false,
|
||||
"text": " Un"
|
||||
},
|
||||
{
|
||||
"id": 23765,
|
||||
"logprob": -1.1894531,
|
||||
"special": false,
|
||||
"text": "supported"
|
||||
},
|
||||
{
|
||||
"id": 4714,
|
||||
"logprob": -1.5917969,
|
||||
"special": false,
|
||||
"text": " browser"
|
||||
},
|
||||
{
|
||||
"id": 29892,
|
||||
"logprob": -0.34765625,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1873,
|
||||
"logprob": -1.2695312,
|
||||
"special": false,
|
||||
"text": " version"
|
||||
},
|
||||
{
|
||||
"id": 470,
|
||||
"logprob": -0.25170898,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 7481,
|
||||
"logprob": -0.21411133,
|
||||
"special": false,
|
||||
"text": " platform"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.1162109,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " failed: Unsupported browser, version or platform\n"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": null,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 4321,
|
||||
"logprob": -9.0859375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 2009,
|
||||
"logprob": -16.359375,
|
||||
"text": "request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5229,
|
||||
"logprob": -2.7988281,
|
||||
"special": false,
|
||||
"text": " failed"
|
||||
},
|
||||
{
|
||||
"id": 29901,
|
||||
"logprob": -0.91259766,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 853,
|
||||
"logprob": -2.8496094,
|
||||
"special": false,
|
||||
"text": " Un"
|
||||
},
|
||||
{
|
||||
"id": 23765,
|
||||
"logprob": -1.1894531,
|
||||
"special": false,
|
||||
"text": "supported"
|
||||
},
|
||||
{
|
||||
"id": 4714,
|
||||
"logprob": -1.5917969,
|
||||
"special": false,
|
||||
"text": " browser"
|
||||
},
|
||||
{
|
||||
"id": 29892,
|
||||
"logprob": -0.34765625,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1873,
|
||||
"logprob": -1.2695312,
|
||||
"special": false,
|
||||
"text": " version"
|
||||
},
|
||||
{
|
||||
"id": 470,
|
||||
"logprob": -0.25170898,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 7481,
|
||||
"logprob": -0.21411133,
|
||||
"special": false,
|
||||
"text": " platform"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.1162109,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " failed: Unsupported browser, version or platform\n"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": null,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 4321,
|
||||
"logprob": -9.0859375,
|
||||
"text": "Test"
|
||||
},
|
||||
{
|
||||
"id": 2009,
|
||||
"logprob": -16.359375,
|
||||
"text": "request"
|
||||
}
|
||||
],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5229,
|
||||
"logprob": -2.7988281,
|
||||
"special": false,
|
||||
"text": " failed"
|
||||
},
|
||||
{
|
||||
"id": 29901,
|
||||
"logprob": -0.91259766,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
},
|
||||
{
|
||||
"id": 853,
|
||||
"logprob": -2.8496094,
|
||||
"special": false,
|
||||
"text": " Un"
|
||||
},
|
||||
{
|
||||
"id": 23765,
|
||||
"logprob": -1.1894531,
|
||||
"special": false,
|
||||
"text": "supported"
|
||||
},
|
||||
{
|
||||
"id": 4714,
|
||||
"logprob": -1.5917969,
|
||||
"special": false,
|
||||
"text": " browser"
|
||||
},
|
||||
{
|
||||
"id": 29892,
|
||||
"logprob": -0.34765625,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 1873,
|
||||
"logprob": -1.2695312,
|
||||
"special": false,
|
||||
"text": " version"
|
||||
},
|
||||
{
|
||||
"id": 470,
|
||||
"logprob": -0.25170898,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 7481,
|
||||
"logprob": -0.21411133,
|
||||
"special": false,
|
||||
"text": " platform"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.1162109,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " failed: Unsupported browser, version or platform\n"
|
||||
}
|
||||
]
|
@ -8,49 +8,49 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 2502,
|
||||
"logprob": -1.734375,
|
||||
"logprob": -1.7890625,
|
||||
"special": false,
|
||||
"text": "image"
|
||||
},
|
||||
{
|
||||
"id": 2196,
|
||||
"logprob": -0.5756836,
|
||||
"logprob": -0.53125,
|
||||
"special": false,
|
||||
"text": " result"
|
||||
},
|
||||
{
|
||||
"id": 604,
|
||||
"logprob": -0.007843018,
|
||||
"logprob": -0.0077209473,
|
||||
"special": false,
|
||||
"text": " for"
|
||||
},
|
||||
{
|
||||
"id": 12254,
|
||||
"logprob": -1.7167969,
|
||||
"logprob": -1.703125,
|
||||
"special": false,
|
||||
"text": " chicken"
|
||||
},
|
||||
{
|
||||
"id": 611,
|
||||
"logprob": -0.17053223,
|
||||
"logprob": -0.21582031,
|
||||
"special": false,
|
||||
"text": " on"
|
||||
},
|
||||
{
|
||||
"id": 573,
|
||||
"logprob": -0.7626953,
|
||||
"logprob": -0.734375,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 8318,
|
||||
"logprob": -0.02709961,
|
||||
"logprob": -0.026000977,
|
||||
"special": false,
|
||||
"text": " beach"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": -0.20739746,
|
||||
"logprob": -0.2109375,
|
||||
"special": true,
|
||||
"text": "<eos>"
|
||||
}
|
||||
|
@ -11,17 +11,17 @@
|
||||
},
|
||||
{
|
||||
"id": 1459,
|
||||
"logprob": -5.6328125,
|
||||
"logprob": -5.625,
|
||||
"text": " print"
|
||||
},
|
||||
{
|
||||
"id": 81,
|
||||
"logprob": -1.6035156,
|
||||
"logprob": -1.6064453,
|
||||
"text": "_"
|
||||
},
|
||||
{
|
||||
"id": 7656,
|
||||
"logprob": -5.9882812,
|
||||
"logprob": -5.9921875,
|
||||
"text": "hello"
|
||||
}
|
||||
],
|
||||
@ -29,7 +29,7 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 2262,
|
||||
"logprob": -0.042999268,
|
||||
"logprob": -0.045715332,
|
||||
"special": false,
|
||||
"text": "():"
|
||||
},
|
||||
@ -59,7 +59,7 @@
|
||||
},
|
||||
{
|
||||
"id": 10896,
|
||||
"logprob": -0.38549805,
|
||||
"logprob": -0.3659668,
|
||||
"special": false,
|
||||
"text": " World"
|
||||
},
|
||||
@ -113,7 +113,7 @@
|
||||
},
|
||||
{
|
||||
"id": 426,
|
||||
"logprob": 0.0,
|
||||
"logprob": -0.051635742,
|
||||
"special": false,
|
||||
"text": "name"
|
||||
},
|
||||
@ -323,7 +323,7 @@
|
||||
},
|
||||
{
|
||||
"id": 313,
|
||||
"logprob": -0.6328125,
|
||||
"logprob": -0.6933594,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
},
|
||||
@ -387,7 +387,8 @@
|
||||
"special": false,
|
||||
"text": " print"
|
||||
}
|
||||
]
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "():\n print(\"Hello World\")\n\ndef print_hello_name(name):\n print(\"Hello \" + name)\n\ndef print_hello_name_age(name, age):\n print(\"Hello \" + name + \" \" + str(age))\n\ndef print"
|
||||
}
|
||||
|
@ -11,12 +11,12 @@
|
||||
},
|
||||
{
|
||||
"id": 1489,
|
||||
"logprob": -5.2617188,
|
||||
"logprob": -5.265625,
|
||||
"text": " print"
|
||||
},
|
||||
{
|
||||
"id": 100,
|
||||
"logprob": -0.38476562,
|
||||
"logprob": -0.38549805,
|
||||
"text": "_"
|
||||
},
|
||||
{
|
||||
@ -29,7 +29,7 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 2284,
|
||||
"logprob": -0.296875,
|
||||
"logprob": -0.31323242,
|
||||
"special": false,
|
||||
"text": "():"
|
||||
},
|
||||
@ -53,19 +53,19 @@
|
||||
},
|
||||
{
|
||||
"id": 8302,
|
||||
"logprob": -0.28125,
|
||||
"logprob": -0.26611328,
|
||||
"special": false,
|
||||
"text": "Hello"
|
||||
},
|
||||
{
|
||||
"id": 10914,
|
||||
"logprob": -0.79248047,
|
||||
"logprob": -0.7817383,
|
||||
"special": false,
|
||||
"text": " World"
|
||||
},
|
||||
{
|
||||
"id": 16013,
|
||||
"logprob": -0.61816406,
|
||||
"logprob": -0.6328125,
|
||||
"special": false,
|
||||
"text": "!\")"
|
||||
},
|
||||
@ -83,7 +83,7 @@
|
||||
},
|
||||
{
|
||||
"id": 610,
|
||||
"logprob": -0.4091797,
|
||||
"logprob": -0.4086914,
|
||||
"special": false,
|
||||
"text": "def"
|
||||
},
|
||||
@ -113,7 +113,7 @@
|
||||
},
|
||||
{
|
||||
"id": 444,
|
||||
"logprob": -0.21655273,
|
||||
"logprob": -0.21826172,
|
||||
"special": false,
|
||||
"text": "name"
|
||||
},
|
||||
@ -160,16 +160,28 @@
|
||||
"text": "Hello"
|
||||
},
|
||||
{
|
||||
"id": 332,
|
||||
"logprob": -0.034698486,
|
||||
"id": 925,
|
||||
"logprob": -3.3476562,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
"text": " %"
|
||||
},
|
||||
{
|
||||
"id": 494,
|
||||
"id": 120,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " +"
|
||||
"text": "s"
|
||||
},
|
||||
{
|
||||
"id": 11571,
|
||||
"logprob": -0.10021973,
|
||||
"special": false,
|
||||
"text": "!\""
|
||||
},
|
||||
{
|
||||
"id": 925,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " %"
|
||||
},
|
||||
{
|
||||
"id": 655,
|
||||
@ -178,22 +190,10 @@
|
||||
"text": " name"
|
||||
},
|
||||
{
|
||||
"id": 494,
|
||||
"logprob": -0.20141602,
|
||||
"special": false,
|
||||
"text": " +"
|
||||
},
|
||||
{
|
||||
"id": 332,
|
||||
"id": 46,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
},
|
||||
{
|
||||
"id": 16013,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "!\")"
|
||||
"text": ")"
|
||||
},
|
||||
{
|
||||
"id": 222,
|
||||
@ -251,7 +251,7 @@
|
||||
},
|
||||
{
|
||||
"id": 400,
|
||||
"logprob": 0.0,
|
||||
"logprob": -0.074279785,
|
||||
"special": false,
|
||||
"text": "age"
|
||||
},
|
||||
@ -310,34 +310,22 @@
|
||||
"text": "Hello"
|
||||
},
|
||||
{
|
||||
"id": 332,
|
||||
"id": 925,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
"text": " %"
|
||||
},
|
||||
{
|
||||
"id": 494,
|
||||
"id": 120,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " +"
|
||||
"text": "s"
|
||||
},
|
||||
{
|
||||
"id": 655,
|
||||
"logprob": 0.0,
|
||||
"id": 49,
|
||||
"logprob": -0.07891846,
|
||||
"special": false,
|
||||
"text": " name"
|
||||
},
|
||||
{
|
||||
"id": 494,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " +"
|
||||
},
|
||||
{
|
||||
"id": 3021,
|
||||
"logprob": -0.5761719,
|
||||
"special": false,
|
||||
"text": " \","
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 863,
|
||||
@ -352,43 +340,55 @@
|
||||
"text": " are"
|
||||
},
|
||||
{
|
||||
"id": 332,
|
||||
"id": 925,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " \""
|
||||
"text": " %"
|
||||
},
|
||||
{
|
||||
"id": 494,
|
||||
"id": 105,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " +"
|
||||
"text": "d"
|
||||
},
|
||||
{
|
||||
"id": 615,
|
||||
"id": 11339,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " str"
|
||||
"text": " years"
|
||||
},
|
||||
{
|
||||
"id": 45,
|
||||
"id": 3627,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "("
|
||||
"text": " old"
|
||||
},
|
||||
{
|
||||
"id": 400,
|
||||
"id": 11571,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "age"
|
||||
"text": "!\""
|
||||
},
|
||||
{
|
||||
"id": 46,
|
||||
"id": 925,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": ")"
|
||||
"text": " %"
|
||||
},
|
||||
{
|
||||
"id": 327,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 444,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "name"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "():\n print(\"Hello World!\")\n\ndef print_hello_name(name):\n print(\"Hello \" + name + \"!\")\n\ndef print_hello_name_age(name, age):\n print(\"Hello \" + name + \", you are \" + str(age)"
|
||||
"generated_text": "():\n print(\"Hello World!\")\n\ndef print_hello_name(name):\n print(\"Hello %s!\" % name)\n\ndef print_hello_name_age(name, age):\n print(\"Hello %s, you are %d years old!\" % (name"
|
||||
}
|
||||
|
@ -36,13 +36,13 @@
|
||||
},
|
||||
{
|
||||
"id": 633,
|
||||
"logprob": -0.09301758,
|
||||
"logprob": -0.09161377,
|
||||
"special": false,
|
||||
"text": " new"
|
||||
},
|
||||
{
|
||||
"id": 4480,
|
||||
"logprob": -0.3322754,
|
||||
"logprob": -0.26171875,
|
||||
"special": false,
|
||||
"text": " feature"
|
||||
},
|
||||
|
@ -0,0 +1,251 @@
|
||||
{
|
||||
"details": {
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 40,
|
||||
"prefill": [],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.27416992,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.17016602,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 28737,
|
||||
"logprob": -2.7109375,
|
||||
"special": false,
|
||||
"text": "I"
|
||||
},
|
||||
{
|
||||
"id": 28809,
|
||||
"logprob": -1.5,
|
||||
"special": false,
|
||||
"text": "’"
|
||||
},
|
||||
{
|
||||
"id": 28719,
|
||||
"logprob": -0.34204102,
|
||||
"special": false,
|
||||
"text": "m"
|
||||
},
|
||||
{
|
||||
"id": 459,
|
||||
"logprob": -1.6914062,
|
||||
"special": false,
|
||||
"text": " not"
|
||||
},
|
||||
{
|
||||
"id": 1864,
|
||||
"logprob": -0.69140625,
|
||||
"special": false,
|
||||
"text": " sure"
|
||||
},
|
||||
{
|
||||
"id": 513,
|
||||
"logprob": -1.6171875,
|
||||
"special": false,
|
||||
"text": " if"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -1.3837891,
|
||||
"special": false,
|
||||
"text": " I"
|
||||
},
|
||||
{
|
||||
"id": 541,
|
||||
"logprob": -1.2226562,
|
||||
"special": false,
|
||||
"text": " can"
|
||||
},
|
||||
{
|
||||
"id": 1567,
|
||||
"logprob": -1.8652344,
|
||||
"special": false,
|
||||
"text": " come"
|
||||
},
|
||||
{
|
||||
"id": 582,
|
||||
"logprob": -0.0070228577,
|
||||
"special": false,
|
||||
"text": " up"
|
||||
},
|
||||
{
|
||||
"id": 395,
|
||||
"logprob": -0.0054092407,
|
||||
"special": false,
|
||||
"text": " with"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.62597656,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28770,
|
||||
"logprob": -0.0035572052,
|
||||
"special": false,
|
||||
"text": "3"
|
||||
},
|
||||
{
|
||||
"id": 4842,
|
||||
"logprob": -0.93603516,
|
||||
"special": false,
|
||||
"text": " unique"
|
||||
},
|
||||
{
|
||||
"id": 3085,
|
||||
"logprob": -0.028411865,
|
||||
"special": false,
|
||||
"text": " words"
|
||||
},
|
||||
{
|
||||
"id": 369,
|
||||
"logprob": -1.0400391,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 6685,
|
||||
"logprob": -0.09710693,
|
||||
"special": false,
|
||||
"text": " describe"
|
||||
},
|
||||
{
|
||||
"id": 528,
|
||||
"logprob": -0.066467285,
|
||||
"special": false,
|
||||
"text": " me"
|
||||
},
|
||||
{
|
||||
"id": 28725,
|
||||
"logprob": -1.0722656,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 562,
|
||||
"logprob": -0.33422852,
|
||||
"special": false,
|
||||
"text": " but"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.5136719,
|
||||
"special": false,
|
||||
"text": " I"
|
||||
},
|
||||
{
|
||||
"id": 28809,
|
||||
"logprob": -0.8989258,
|
||||
"special": false,
|
||||
"text": "’"
|
||||
},
|
||||
{
|
||||
"id": 584,
|
||||
"logprob": -0.2076416,
|
||||
"special": false,
|
||||
"text": "ll"
|
||||
},
|
||||
{
|
||||
"id": 1464,
|
||||
"logprob": -0.8808594,
|
||||
"special": false,
|
||||
"text": " try"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.88427734,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.91064453,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.08105469,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 28740,
|
||||
"logprob": -1.8486328,
|
||||
"special": false,
|
||||
"text": "1"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.111572266,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 23626,
|
||||
"logprob": -3.15625,
|
||||
"special": false,
|
||||
"text": " Creative"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.9194336,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 28750,
|
||||
"logprob": -0.24841309,
|
||||
"special": false,
|
||||
"text": "2"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -9.393692e-05,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 6785,
|
||||
"logprob": -3.1386719,
|
||||
"special": false,
|
||||
"text": " Fun"
|
||||
},
|
||||
{
|
||||
"id": 1780,
|
||||
"logprob": -0.53564453,
|
||||
"special": false,
|
||||
"text": "ny"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.09033203,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 28770,
|
||||
"logprob": -0.00466156,
|
||||
"special": false,
|
||||
"text": "3"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.00016450882,
|
||||
"special": false,
|
||||
"text": "."
|
||||
}
|
||||
]
|
||||
},
|
||||
"generated_text": "\n\nI’m not sure if I can come up with 3 unique words that describe me, but I’ll try.\n\n1. Creative\n2. Funny\n3."
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
{
|
||||
"details": {
|
||||
"finish_reason": "eos_token",
|
||||
"generated_tokens": 7,
|
||||
"prefill": [],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": -0.49658203,
|
||||
"special": true,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.0016384125,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"logprob": -1.4931641,
|
||||
"special": true,
|
||||
"text": "<s>"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.00075769424,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28740,
|
||||
"logprob": -0.25024414,
|
||||
"special": false,
|
||||
"text": "1"
|
||||
},
|
||||
{
|
||||
"id": 28740,
|
||||
"logprob": -0.2631836,
|
||||
"special": false,
|
||||
"text": "1"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"logprob": -0.0003285408,
|
||||
"special": true,
|
||||
"text": "</s>"
|
||||
}
|
||||
]
|
||||
},
|
||||
"generated_text": " 11"
|
||||
}
|
@ -0,0 +1,251 @@
|
||||
{
|
||||
"details": {
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 40,
|
||||
"prefill": [],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.0488281,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.0800781,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 27332,
|
||||
"logprob": -2.1152344,
|
||||
"special": false,
|
||||
"text": "###"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -1.6748047,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28740,
|
||||
"logprob": -0.097229004,
|
||||
"special": false,
|
||||
"text": "1"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.16467285,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 7615,
|
||||
"logprob": -2.2246094,
|
||||
"special": false,
|
||||
"text": " News"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.0488281,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 27332,
|
||||
"logprob": -0.69189453,
|
||||
"special": false,
|
||||
"text": "###"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.013343811,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28750,
|
||||
"logprob": -0.011230469,
|
||||
"special": false,
|
||||
"text": "2"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.00096845627,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 21095,
|
||||
"logprob": -2.5605469,
|
||||
"special": false,
|
||||
"text": " Blog"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.19458008,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 27332,
|
||||
"logprob": -0.031280518,
|
||||
"special": false,
|
||||
"text": "###"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.0030708313,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28770,
|
||||
"logprob": -0.0029277802,
|
||||
"special": false,
|
||||
"text": "3"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.0012350082,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 20108,
|
||||
"logprob": -2.1582031,
|
||||
"special": false,
|
||||
"text": " Article"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.05810547,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 27332,
|
||||
"logprob": -0.35083008,
|
||||
"special": false,
|
||||
"text": "###"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.034332275,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28781,
|
||||
"logprob": -0.009666443,
|
||||
"special": false,
|
||||
"text": "4"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.0013113022,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 8349,
|
||||
"logprob": -2.6191406,
|
||||
"special": false,
|
||||
"text": " Review"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.04031372,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 27332,
|
||||
"logprob": -0.45239258,
|
||||
"special": false,
|
||||
"text": "###"
|
||||
},
|
||||
{
|
||||
"id": 28705,
|
||||
"logprob": -0.045410156,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 28782,
|
||||
"logprob": -0.0041236877,
|
||||
"special": false,
|
||||
"text": "5"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.0010223389,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 5299,
|
||||
"logprob": -2.8066406,
|
||||
"special": false,
|
||||
"text": " Other"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.12054443,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.44580078,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.4921875,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.3574219,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -1.0039062,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.5859375,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.43481445,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.2783203,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.20410156,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
}
|
||||
]
|
||||
},
|
||||
"generated_text": "\n\n### 1. News\n### 2. Blog\n### 3. Article\n### 4. Review\n### 5. Other\n\n\n\n\n\n\n\n\n"
|
||||
}
|
@ -0,0 +1,251 @@
|
||||
{
|
||||
"details": {
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 40,
|
||||
"prefill": [],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.31347656,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.27441406,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 28737,
|
||||
"logprob": -2.2285156,
|
||||
"special": false,
|
||||
"text": "I"
|
||||
},
|
||||
{
|
||||
"id": 28809,
|
||||
"logprob": -1.4677734,
|
||||
"special": false,
|
||||
"text": "’"
|
||||
},
|
||||
{
|
||||
"id": 28719,
|
||||
"logprob": -0.31762695,
|
||||
"special": false,
|
||||
"text": "m"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -1.6865234,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 1215,
|
||||
"logprob": -3.2695312,
|
||||
"special": false,
|
||||
"text": " very"
|
||||
},
|
||||
{
|
||||
"id": 20640,
|
||||
"logprob": -3.1230469,
|
||||
"special": false,
|
||||
"text": " passionate"
|
||||
},
|
||||
{
|
||||
"id": 1338,
|
||||
"logprob": -0.48339844,
|
||||
"special": false,
|
||||
"text": " person"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.9970703,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.5498047,
|
||||
"special": false,
|
||||
"text": " I"
|
||||
},
|
||||
{
|
||||
"id": 28809,
|
||||
"logprob": -1.1923828,
|
||||
"special": false,
|
||||
"text": "’"
|
||||
},
|
||||
{
|
||||
"id": 28719,
|
||||
"logprob": -0.080444336,
|
||||
"special": false,
|
||||
"text": "m"
|
||||
},
|
||||
{
|
||||
"id": 1215,
|
||||
"logprob": -1.8271484,
|
||||
"special": false,
|
||||
"text": " very"
|
||||
},
|
||||
{
|
||||
"id": 12215,
|
||||
"logprob": -2.8847656,
|
||||
"special": false,
|
||||
"text": " driven"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -1.0927734,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.4584961,
|
||||
"special": false,
|
||||
"text": " I"
|
||||
},
|
||||
{
|
||||
"id": 28809,
|
||||
"logprob": -0.5019531,
|
||||
"special": false,
|
||||
"text": "’"
|
||||
},
|
||||
{
|
||||
"id": 28719,
|
||||
"logprob": -0.030715942,
|
||||
"special": false,
|
||||
"text": "m"
|
||||
},
|
||||
{
|
||||
"id": 1215,
|
||||
"logprob": -0.96972656,
|
||||
"special": false,
|
||||
"text": " very"
|
||||
},
|
||||
{
|
||||
"id": 7798,
|
||||
"logprob": -2.8847656,
|
||||
"special": false,
|
||||
"text": " determined"
|
||||
},
|
||||
{
|
||||
"id": 28723,
|
||||
"logprob": -0.27319336,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.56396484,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.011016846,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 3195,
|
||||
"logprob": -0.7163086,
|
||||
"special": false,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 349,
|
||||
"logprob": -1.1611328,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 574,
|
||||
"logprob": -0.515625,
|
||||
"special": false,
|
||||
"text": " your"
|
||||
},
|
||||
{
|
||||
"id": 6656,
|
||||
"logprob": -1.0253906,
|
||||
"special": false,
|
||||
"text": " favorite"
|
||||
},
|
||||
{
|
||||
"id": 1970,
|
||||
"logprob": -2.1738281,
|
||||
"special": false,
|
||||
"text": " thing"
|
||||
},
|
||||
{
|
||||
"id": 684,
|
||||
"logprob": -0.48364258,
|
||||
"special": false,
|
||||
"text": " about"
|
||||
},
|
||||
{
|
||||
"id": 1250,
|
||||
"logprob": -1.8876953,
|
||||
"special": false,
|
||||
"text": " being"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.41967773,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 8626,
|
||||
"logprob": -2.9160156,
|
||||
"special": false,
|
||||
"text": " teacher"
|
||||
},
|
||||
{
|
||||
"id": 28804,
|
||||
"logprob": -0.11920166,
|
||||
"special": false,
|
||||
"text": "?"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.023727417,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.010848999,
|
||||
"special": false,
|
||||
"text": "\n"
|
||||
},
|
||||
{
|
||||
"id": 28737,
|
||||
"logprob": -1.0566406,
|
||||
"special": false,
|
||||
"text": "I"
|
||||
},
|
||||
{
|
||||
"id": 2016,
|
||||
"logprob": -0.7163086,
|
||||
"special": false,
|
||||
"text": " love"
|
||||
},
|
||||
{
|
||||
"id": 272,
|
||||
"logprob": -1.9169922,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 1639,
|
||||
"logprob": -2.03125,
|
||||
"special": false,
|
||||
"text": " fact"
|
||||
}
|
||||
]
|
||||
},
|
||||
"generated_text": "\n\nI’m a very passionate person. I’m very driven. I’m very determined.\n\nWhat is your favorite thing about being a teacher?\n\nI love the fact"
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user