Merge branch 'main' into hot_fix_xpu

Wang, Yi A 2024-08-06 21:57:25 -07:00
commit 30e70f2ceb
109 changed files with 5433 additions and 1838 deletions

@@ -2,3 +2,5 @@ aml
target
server/transformers
server/flash-attention
cmake-build-debug/
cmake-build-release/

@@ -28,7 +28,7 @@ jobs:
- name: Install router
id: install-router
run: cargo install --path router/
run: cargo install --path backends/v3/
- uses: actions/setup-node@v4
with:
@@ -41,5 +41,5 @@ jobs:
- name: Check that documentation is up-to-date
run: |
npm install -g swagger-cli
npm install -g @redocly/cli
python update_doc.py --check

@@ -10,6 +10,7 @@ on:
paths:
- ".github/workflows/build.yaml"
- "integration-tests/**"
- "backends/**"
- "server/**"
- "proto/**"
- "router/**"

.gitignore

@@ -3,6 +3,10 @@ target
router/tokenizer.json
*__pycache__*
backends/v3/src/client/pb
backends/client/src/v2/pb
backends/client/src/v3/pb
# ROCm auto-generated files
*.hip
server/exllamav2_kernels/exllamav2_kernels/hip/

@@ -13,8 +13,8 @@ repos:
- repo: https://github.com/doublify/pre-commit-rust
rev: v1.0
hooks:
- id: fmt
- id: cargo-check
- id: fmt
- id: clippy
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.0

.redocly.lint-ignore.yaml

@@ -0,0 +1,79 @@
# This file instructs Redocly's linter to ignore the rules contained for specific parts of your API.
# See https://redoc.ly/docs/cli/ for more information.
docs/openapi.json:
no-empty-servers:
- '#/openapi'
spec:
- >-
#/components/schemas/GenerateParameters/properties/best_of/exclusiveMinimum
- >-
#/components/schemas/GenerateParameters/properties/frequency_penalty/exclusiveMinimum
- '#/components/schemas/GenerateParameters/properties/grammar/nullable'
- >-
#/components/schemas/GenerateParameters/properties/repetition_penalty/exclusiveMinimum
- '#/components/schemas/GenerateParameters/properties/seed/exclusiveMinimum'
- >-
#/components/schemas/GenerateParameters/properties/temperature/exclusiveMinimum
- '#/components/schemas/GenerateParameters/properties/top_k/exclusiveMinimum'
- >-
#/components/schemas/GenerateParameters/properties/top_n_tokens/exclusiveMinimum
- '#/components/schemas/GenerateParameters/properties/top_p/exclusiveMinimum'
- >-
#/components/schemas/GenerateParameters/properties/typical_p/exclusiveMinimum
- '#/components/schemas/GenerateResponse/properties/details/nullable'
- '#/components/schemas/StreamResponse/properties/details/nullable'
- '#/components/schemas/ChatRequest/properties/response_format/nullable'
- '#/components/schemas/ChatRequest/properties/tool_choice/nullable'
- '#/components/schemas/ToolChoice/nullable'
- '#/components/schemas/ChatCompletionComplete/properties/logprobs/nullable'
- '#/components/schemas/ChatCompletionChoice/properties/logprobs/nullable'
no-invalid-media-type-examples:
- '#/paths/~1/post/responses/422/content/application~1json/example'
- '#/paths/~1/post/responses/424/content/application~1json/example'
- '#/paths/~1/post/responses/429/content/application~1json/example'
- '#/paths/~1/post/responses/500/content/application~1json/example'
- '#/paths/~1generate/post/responses/422/content/application~1json/example'
- '#/paths/~1generate/post/responses/424/content/application~1json/example'
- '#/paths/~1generate/post/responses/429/content/application~1json/example'
- '#/paths/~1generate/post/responses/500/content/application~1json/example'
- >-
#/paths/~1generate_stream/post/responses/422/content/text~1event-stream/example
- >-
#/paths/~1generate_stream/post/responses/424/content/text~1event-stream/example
- >-
#/paths/~1generate_stream/post/responses/429/content/text~1event-stream/example
- >-
#/paths/~1generate_stream/post/responses/500/content/text~1event-stream/example
- '#/paths/~1tokenize/post/responses/404/content/application~1json/example'
- >-
#/paths/~1v1~1chat~1completions/post/responses/422/content/application~1json/example
- >-
#/paths/~1v1~1chat~1completions/post/responses/424/content/application~1json/example
- >-
#/paths/~1v1~1chat~1completions/post/responses/429/content/application~1json/example
- >-
#/paths/~1v1~1chat~1completions/post/responses/500/content/application~1json/example
- >-
#/paths/~1v1~1completions/post/responses/422/content/application~1json/example
- >-
#/paths/~1v1~1completions/post/responses/424/content/application~1json/example
- >-
#/paths/~1v1~1completions/post/responses/429/content/application~1json/example
- >-
#/paths/~1v1~1completions/post/responses/500/content/application~1json/example
operation-4xx-response:
- '#/paths/~1health/get/responses'
- '#/paths/~1info/get/responses'
- '#/paths/~1metrics/get/responses'
no-unused-components:
- '#/components/schemas/Completion'
security-defined:
- '#/paths/~1/post'
- '#/paths/~1generate/post'
- '#/paths/~1generate_stream/post'
- '#/paths/~1health/get'
- '#/paths/~1info/get'
- '#/paths/~1metrics/get'
- '#/paths/~1tokenize/post'
- '#/paths/~1v1~1chat~1completions/post'
- '#/paths/~1v1~1completions/post'

Cargo.lock

File diff suppressed because it is too large.

@@ -1,9 +1,18 @@
[workspace]
members = [
"benchmark",
"router",
"router/client",
"router/grpc-metadata",
"backends/v3",
"backends/grpc-metadata",
"backends/trtllm",
"backends/client",
"launcher"
]
default-members = [
"benchmark",
"backends/v3",
"backends/grpc-metadata",
# "backends/trtllm",
"backends/client",
"launcher"
]
resolver = "2"
@@ -18,6 +27,8 @@ homepage = "https://github.com/huggingface/text-generation-inference"
base64 = "0.22.0"
tokenizers = { version = "0.19.1", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
[profile.release]
incremental = true

@@ -11,6 +11,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
@@ -33,6 +34,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

Dockerfile.trtllm

@@ -0,0 +1,23 @@
# All the tooling for CUDA
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS cuda-builder
WORKDIR /usr/src/tgi/backends/trtllm
RUN apt update && apt install -y cmake git git-lfs gcc g++ ninja-build libopenmpi-dev python3-dev python3-pip wget
COPY . /usr/src/tgi
RUN chmod +x scripts/install_tensorrt.sh && scripts/install_tensorrt.sh
RUN cmake -G Ninja -B build -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include .
RUN cmake --build build --parallel -t tgi_trtllm_backend_impl
# All the tooling for Rust
FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
WORKDIR /usr/src
# Include CUDA related libraries and tools to the Rust based image
COPY --from=cuda-builder /usr/local/cuda /usr/local/cuda
COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=cuda-builder /usr/src/tgi/backends/trtllm/build /usr/local/tgi/trtllm/build
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH
RUN apt update && apt install -y cmake git gcc g++ ninja-build libopenmpi3

@@ -11,6 +11,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
@@ -33,6 +34,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

@@ -12,6 +12,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
@@ -34,6 +35,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

@@ -5,13 +5,13 @@ install-server-cpu:
cd server && make install-server
install-router:
cd router && cargo install --path .
cargo install --path backends/v3/
install-launcher:
cd launcher && cargo install --path .
cargo install --path launcher/
install-benchmark:
cd benchmark && cargo install --path .
cargo install --path benchmark/
install: install-server install-router install-launcher

@@ -0,0 +1,63 @@
cmake_minimum_required(VERSION 3.20)
project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)
include(FetchContent)
include(ExternalProject)
option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
# We are using nvidia-ml to query device information at runtime to enable some architecture-specific features
find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
#### External dependencies ####
include(cmake/fmt.cmake)
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)
# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
# Tell CMake not to try to override the RPATH for executorWorker, as it has no information on how to do so
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
# This installs all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make them easy to link against / find later
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
message(STATUS "Building tests")
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2
GIT_TAG v3.6.0
)
FetchContent_MakeAvailable(Catch2)
# add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
# target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
include(CTest)
include(Catch)
# catch_discover_tests(tgi_trtllm_backend_tests)
endif ()
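
The options above can be toggled at configure time. Below is a minimal sketch of an out-of-tree configure and build mirroring the invocation used in Dockerfile.trtllm; the TensorRT paths are the defaults declared above, and switching the test suite on is an added assumption, not part of the Dockerfile invocation:

```shell
# Sketch only: run from backends/trtllm; paths mirror the defaults above.
cmake -G Ninja -B build \
    -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include \
    -DTRT_LIB_DIR=/usr/local/tensorrt/lib \
    -DTGI_TRTLLM_BACKEND_BUILD_TESTS=ON \
    .
# Build the backend implementation target, as done in Dockerfile.trtllm.
cmake --build build --parallel -t tgi_trtllm_backend_impl
```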

@@ -0,0 +1,26 @@
[package]
name = "text-generation-backends-trtllm"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true
[dependencies]
async-trait = "0.1"
async-stream = "0.3"
cxx = "1.0"
text-generation-router = { path = "../../router" }
tokenizers = { version = "0.19", features = ["hf-hub"] }
tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
clap = { version = "4.5", features = ["derive"] }
thiserror = "1.0.62"
tracing = "0.1"
tracing-opentelemetry = "0.24"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
log = { version = "0.4", features = [] }
[build-dependencies]
cmake = "0.1"
cxx-build = { version = "1.0", features = ["parallel"] }
pkg-config = "0.3"

backends/trtllm/Dockerfile

@@ -0,0 +1,100 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.6"
# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
WORKDIR /usr/src/text-generation-inference
FROM chef AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt update && apt install -y \
build-essential \
cmake \
curl \
gcc \
g++ \
git \
git-lfs \
libssl-dev \
ninja-build \
pkg-config \
python3 \
python3-setuptools \
tar \
wget
ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
# Install OpenMPI
FROM cuda-builder AS mpi-builder
ARG OMPI_VERSION
ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2"
RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \
mkdir /usr/src/mpi && \
tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
cd /usr/src/mpi && \
./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --without-slurm && \
make -j all && \
make install && \
rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"
# Install TensorRT
FROM cuda-builder AS trt-builder
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
/opt/install_tensorrt.sh
# Build Backend
FROM cuda-builder AS tgi-builder
WORKDIR /usr/src/text-generation-inference
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
chmod -R a+w /root/.rustup && \
chmod -R a+w /root/.cargo
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef
# Cache dependencies
COPY --from=planner /usr/src/text-generation-inference/recipe.json .
RUN cargo chef cook --release --recipe-path recipe.json
# Build actual TGI
ARG CUDA_ARCH_LIST
ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH"
ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH"
COPY . .
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release --bin text-generation-backends-trtllm
FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
WORKDIR /usr/local/tgi/bin
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
FROM runtime
LABEL co.huggingface.vendor="Hugging Face Inc."
LABEL org.opencontainers.image.authors="hardware@hf.co"
ENTRYPOINT ["./text-generation-launcher"]
CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]

backends/trtllm/README.md

@@ -0,0 +1,46 @@
# Text Generation Inference - TensorRT-LLM Backend Implementation
## Description
This folder provides the sources of the TensorRT-LLM backend implementation, powered by the new TensorRT-LLM Executor API
## Simplified Request Sequence
```mermaid
sequenceDiagram
actor User
participant TextGenerationInference.HttpServer
participant TextGenerationInference.TensorRtLlmBackend
participant TextGenerationInference.TensorRtLlmWorkerThread
participant TensorRtLlm.Executor
participant Nvidia.Gpu
User ->> TextGenerationInference.HttpServer: POST /generate
TextGenerationInference.HttpServer ->> TextGenerationInference.TensorRtLlmBackend: Validate and forward inputs & parameters
TextGenerationInference.TensorRtLlmBackend ->> TextGenerationInference.TensorRtLlmWorkerThread: Allocate a new context and spawn a new thread to handle the request
TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Submit the request to the In-Flight Batcher
activate Nvidia.Gpu
TensorRtLlm.Executor ->> Nvidia.Gpu: Add the request to the pool for execution
TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Respond with a unique request identifier
rect rgb(10, 92, 54)
loop every 100us
rect rgb(15, 81, 50)
alt Acquire lock to query executor
TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Poll the request for the number of new token(s) generated
else There are new generated tokens
TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Retrieve newly generated tokens
TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Return decoded token information and potential error (omitted)
rect rgb(11, 110, 79)
alt Generated token is final
TensorRtLlm.Executor ->> Nvidia.Gpu: Remove request from the scheduler and from the GPU
TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream the remaining decoded tokens and flush the connection
else Generated token is not final
TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream tokens back to the user as they get decoded
end
end
end
end
deactivate Nvidia.Gpu
end
end
```
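
For context, a minimal sketch of the request that enters this sequence at the top, assuming the server is listening on the default port 3000; the prompt and sampling parameter values are placeholders, and the parameter names follow TGI's /generate API:

```shell
# Sketch only: prompt and parameter values are placeholders.
curl -s http://localhost:3000/generate \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64, "temperature": 0.7, "top_k": 10, "seed": 42}}'
```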

backends/trtllm/build.rs

@@ -0,0 +1,150 @@
use cxx_build::CFG;
use pkg_config;
use std::env;
use std::env::consts::ARCH;
use std::path::{absolute, PathBuf};
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
const CUDA_REQUIRED_VERSION: &str = "12.5";
const MPI_REQUIRED_VERSION: &str = "4.1";
const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
// Dependencies
const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"];
const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
("dylib", "tensorrt_llm"),
("static", "tensorrt_llm_executor_static"),
("dylib", "tensorrt_llm_nvrtc_wrapper"),
("dylib", "nvinfer_plugin_tensorrt_llm"),
("dylib", "decoder_attention"),
];
macro_rules! probe {
($name: expr, $version: expr) => {
if let Err(_) = pkg_config::probe_library($name) {
pkg_config::probe_library(&format!("{}-{}", $name, $version))
.expect(&format!("Failed to locate {}", $name));
}
};
}
fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
// Build the backend implementation through CMake
let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default
let mut install_path = PathBuf::from(install_path);
if !install_path.is_absolute() {
install_path = absolute(out_dir).expect("cannot happen").join(install_path);
}
let _ = cmake::Config::new(".")
.uses_cxx11()
.generator("Ninja")
.profile(match is_debug {
true => "Debug",
false => "Release",
})
.env("OPT_LEVEL", opt_level)
.define("CMAKE_INSTALL_PREFIX", &install_path)
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
.build();
// Additional transitive CMake dependencies
let deps_folder = out_dir.join("build").join("_deps");
for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES {
let dep_name = match is_debug {
true => format!("{}d", dependency),
false => String::from(dependency),
};
let dep_path = deps_folder.join(format!("{}-build", dependency));
println!("cargo:rustc-link-search={}", dep_path.display());
println!("cargo:rustc-link-lib=static={}", dep_name);
}
// Emit linkage information from the artifacts we just built
let install_lib_path = install_path.join("lib");
println!(
r"cargo:warning=Adding link search path: {}",
install_lib_path.display()
);
println!(r"cargo:rustc-link-search={}", install_lib_path.display());
(PathBuf::from(install_path), deps_folder)
}
fn build_ffi_layer(deps_folder: &PathBuf) {
CFG.include_prefix = "backends/trtllm";
cxx_build::bridge("src/lib.rs")
.static_flag(true)
.include(deps_folder.join("fmt-src").join("include"))
.include(deps_folder.join("spdlog-src").join("include"))
.include(deps_folder.join("json-src").join("include"))
.include(deps_folder.join("trtllm-src").join("cpp").join("include"))
.include("/usr/local/cuda/include")
.include("/usr/local/tensorrt/include")
.file("src/ffi.cpp")
.std("c++20")
.compile("tgi_trtllm_backend");
println!("cargo:rerun-if-changed=CMakeLists.txt");
println!("cargo:rerun-if-changed=include/backend.h");
println!("cargo:rerun-if-changed=lib/backend.cpp");
println!("cargo:rerun-if-changed=include/ffi.h");
println!("cargo:rerun-if-changed=src/ffi.cpp");
}
fn main() {
// Misc variables
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
let build_profile = env::var("PROFILE").unwrap();
let (is_debug, opt_level) = match build_profile.as_ref() {
"debug" => (true, "0"),
_ => (false, "3"),
};
// Build the backend
let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);
// Build the FFI layer calling the backend above
build_ffi_layer(&deps_folder);
// Emit linkage search path
probe!("ompi", MPI_REQUIRED_VERSION);
// Probe CUDA & co. with pkg-config
CUDA_TRANSITIVE_DEPS.iter().for_each(|name| {
probe!(name, CUDA_REQUIRED_VERSION);
});
// NCCL is slightly trickier because it might not have a pkgconfig installed
let nccl_library_path_default = format!("/usr/local/{}-linux-gnu", ARCH);
let nccl_library_path = NCCL_ROOT_DIR.unwrap_or(&nccl_library_path_default);
println!(r"cargo:rustc-link-search=native={}", nccl_library_path);
println!("cargo:rustc-link-lib=dylib=nccl");
// TensorRT
let tensort_library_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt/lib");
println!(r"cargo:rustc-link-search=native={}", tensort_library_path);
println!("cargo:rustc-link-lib=dylib=nvinfer");
// TensorRT-LLM
TENSORRT_LLM_TRANSITIVE_DEPS
.iter()
.for_each(|(link_type, name)| {
println!("cargo:rustc-link-lib={}={}", link_type, name);
});
// Backend
BACKEND_DEPS.iter().for_each(|name| {
println!("cargo:rustc-link-lib=static={}", name);
});
}

@@ -0,0 +1,6 @@
FetchContent_Declare(
fmt
GIT_REPOSITORY https://github.com/fmtlib/fmt
GIT_TAG 11.0.1
)
FetchContent_MakeAvailable(fmt)

@@ -0,0 +1,5 @@
fetchcontent_declare(
json
URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
)
fetchcontent_makeavailable(json)

@@ -0,0 +1,17 @@
set(SPDLOG_USE_FMT ON)
set(SPDLOG_BUILD_SHARED OFF)
set(SPDLOG_FMT_EXTERNAL ON)
# Define the level at which SPDLOG_ compilation level is defined
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
else ()
add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
endif ()
fetchcontent_declare(
spdlog
GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG v1.14.1
)
fetchcontent_makeavailable(spdlog)

@@ -0,0 +1,42 @@
set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
set(USE_CXX11_ABI ON)
set(BUILD_PYT OFF)
set(BUILD_PYBIND OFF)
set(BUILD_MICRO_BENCHMARKS OFF)
set(BUILD_BENCHMARKS OFF)
set(BUILD_TESTS OFF)
set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(FAST_BUILD ON)
set(NVTX_DISABLE OFF)
else ()
set(FAST_BUILD OFF)
set(FAST_MATH ON)
set(NVTX_DISABLE ON)
endif ()
fetchcontent_declare(
trtllm
GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1
GIT_SHALLOW FALSE
)
fetchcontent_makeavailable(trtllm)
message(STATUS "Found TensorRT-LLM: ${trtllm_SOURCE_DIR}")
execute_process(COMMAND git lfs install WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
execute_process(COMMAND git lfs pull WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
# TRTLLM uses a JIT-based *precompiled* library to generate some specific kernels; we are generating the path to this library here
set(TRTLLM_NVRTC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_nvrtc_wrapper${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE INTERNAL "nvrtc wrapper library name")
set(TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_NVRTC_LIBRARY_NAME}"
CACHE INTERNAL "nvrtc wrapper library path")
# The same Executor Static library
set(TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_executor_static${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE INTERNAL "executor_static library name")
set(TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/executor/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME}" CACHE INTERNAL "executor_static library path")

@@ -0,0 +1,121 @@
//
// Created by Morgan Funtowicz on 6/30/24.
//
#ifndef TGI_TRTLLM_BACKEND_H
#define TGI_TRTLLM_BACKEND_H
#include <cmath>
#include <filesystem>
#include <span>
#include <vector>
#include <nlohmann/json.hpp>
#include <tensorrt_llm/runtime/common.h>
#include <tensorrt_llm/executor/executor.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
using json = nlohmann::json;
namespace tle = tensorrt_llm::executor;
namespace huggingface::tgi::backends {
using RequestId = tle::IdType;
using TokenId = tle::TokenIdType;
/**
* Initialize all the components required by TRTLLM.
* It is required to call this function before attempting to load any engine
*/
void InitializeBackend();
/**
*
* @param config TensorRT-LLM configuration object
* @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
* @return
*/
tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
/**
* Get the sampling configuration from the parameters provided by TGI
* @param topK
* @param topP
* @param temperature
* @param repetition_penalty
* @param frequency_penalty
* @param seed
* @return
*/
tle::SamplingConfig GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed
);
/**
*
*/
class TensorRtLlmBackend {
private:
const json config;
tle::Executor executor;
public:
explicit TensorRtLlmBackend(
const std::filesystem::path &engineFolder,
const std::filesystem::path &executorWorker
);
/**
* Indicate if the backend is ready to accept incoming requests
* @return true if ready, false otherwise
*/
[[nodiscard]] bool IsReady() const;
/**
* Query the executor for the number of tokens available for pulling
* @return
*/
[[nodiscard]] size_t NumResponsesReady() const;
/**
* Submit a new generation task to the executor
* @param tokens
* @param topK
* @param topP
* @param temperature
* @param repetition_penalty
* @param frequency_penalty
* @param seed
* @return Request id related to this generation for reference
*/
[[nodiscard]] RequestId Submit(
const std::vector<TokenId> &tokens,
int32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed
);
/**
*
* @param requestId The request id to poll the generation results
* @return
*/
std::vector<tle::Response> Poll(RequestId requestId);
/**
* Stop the underlying executor
*/
void Shutdown();
};
}
#endif //TGI_TRTLLM_BACKEND_H

@@ -0,0 +1,75 @@
//
// Created by mfuntowicz on 7/11/24.
//
#ifndef TGI_TRTLLM_BACKEND_FFI_H
#define TGI_TRTLLM_BACKEND_FFI_H
#include <cstddef>
#include "backend.h"
namespace huggingface::tgi::backends {
class TensorRtLlmBackendImpl;
}
#include "backends/trtllm/src/lib.rs.h"
namespace huggingface::tgi::backends {
// struct GenerationContext;
class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
public:
/***
*
* @param engineFolder
* @param executorWorker
*/
TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
/***
*
* @return
*/
bool IsReady() const;
/***
*
* @param tokens
* @param topK
* @param topP
* @param temperature
* @param repetition_penalty
* @param frequency_penalty
* @param seed
* @return
*/
[[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
uint64_t
Submit(rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature,
float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
/***
*
* @param requestId
* @param ctx
* @param callback
* @return
*/
size_t StreamTokens(
const RequestId requestId,
huggingface::tgi::backends::GenerationContext *ctx,
rust::Fn<void(huggingface::tgi::backends::GenerationContext *,
huggingface::tgi::backends::GenerationStep)> callback);
};
/***
*
* @param engineFolder
* @return
*/
std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
}
#endif //TGI_TRTLLM_BACKEND_FFI_H

@@ -0,0 +1,59 @@
//
// Created by mfuntowicz on 7/23/24.
//
#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
#define TGI_TRTLLM_BACKEND_HARDWARE_H
#include <cstdint>
#include <limits>
#include <fmt/base.h>
#include <spdlog/spdlog.h>
#include <nvml.h>
namespace huggingface::hardware::cuda {
#define AMPERE_SM_MAJOR 8
#define HOPPER_SM_MAJOR 9
/**
* Store information about the version of the CUDA Compute Capabilities detected on the device
*/
struct CudaComputeCapabilities {
int32_t major;
int32_t minor;
[[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
[[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
};
CudaComputeCapabilities GetCudaComputeCapabilities() {
// Get the compute capabilities of the current hardware
nvmlDevice_t device;
CudaComputeCapabilities capabilities{0, 0};
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
}
}
return capabilities;
}
/**
* Return the number of GPUs detected. If no GPU is detected, return std::nullopt
* @return
*/
std::optional<size_t> GetNumDevices() {
uint32_t numGpus = 0;
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
return std::optional(numGpus);
} else {
return std::nullopt;
}
}
}
#endif //TGI_TRTLLM_BACKEND_HARDWARE_H

@@ -0,0 +1,146 @@
#include <fstream>
#include <fmt/ranges.h>
#include <spdlog/spdlog.h>
#include <nvml.h>
#include "backend.h"
#include "hardware.h"
void huggingface::tgi::backends::InitializeBackend() {
SPDLOG_INFO("Initializing Backend...");
nvmlInit_v2();
initTrtLlmPlugins();
const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
if (numGpus.has_value()) {
SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
} else {
SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
}
}
[[nodiscard]]
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
tle::ExecutorConfig execConfig(1);
// Retrieve the compute capabilities to enable some options at runtime
const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
SPDLOG_INFO("Detected single engine deployment, using leader mode");
execConfig.setParallelConfig(tle::ParallelConfig(
tle::CommunicationType::kMPI,
tle::CommunicationMode::kLEADER,
std::nullopt,
std::nullopt,
std::nullopt
));
} else { // Multiple engines -> using orchestrator mode (MPI involved)
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
execConfig.setParallelConfig(tle::ParallelConfig(
tle::CommunicationType::kMPI,
tle::CommunicationMode::kORCHESTRATOR,
std::nullopt,
std::nullopt,
tle::OrchestratorConfig(true, workerPath, nullptr, true)
));
}
// Define some configuration variables
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
return execConfig;
}
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed) {
return tle::SamplingConfig(
1, // TGI only uses a single beam
topK,
topP,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
temperature,
temperature,
std::nullopt,
repetition_penalty,
std::nullopt,
frequency_penalty
);
}
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
const std::filesystem::path &enginesFolder,
const std::filesystem::path &executorWorker
) :
config(json::parse(std::ifstream(enginesFolder / "config.json"))),
executor(
enginesFolder,
tensorrt_llm::executor::ModelType::kDECODER_ONLY,
GetExecutorConfig(config, executorWorker.string()
)) {
SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());
}
bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
return executor.canEnqueueRequests();
}
[[nodiscard("Returned number of requests needs to be consumed")]]
size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
return executor.getNumResponsesReady();
}
[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
const std::vector<tle::TokenIdType> &tokens,
const int32_t topK,
const float_t topP,
const float_t temperature,
const float_t repetition_penalty,
const float_t frequency_penalty,
const uint64_t seed
) {
#ifdef NDEBUG
SPDLOG_DEBUG(
FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
tokens.size(),
executor.getLatestIterationStats().back().numActiveRequests
);
#else
SPDLOG_DEBUG(
FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
fmt::join(tokens, ", "),
executor.getLatestIterationStats().front().numActiveRequests
);
#endif
const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<size_t>();
const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
const auto output = tle::OutputConfig(true, false, false, true, false);
return executor.enqueueRequest(
tle::Request{tokens, maxNewTokens, true, sampling, output});
}
[[nodiscard("Generated tokens result must be used")]]
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
return executor.awaitResponses(requestId);
}
void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() {
SPDLOG_INFO("Shutting down executor");
executor.shutdown();
}

@@ -0,0 +1,111 @@
#!/bin/bash
set -ex
TRT_VER="10.2.0.19"
CUDA_VER="12.5"
CUDNN_VER="9.2.1.18-1"
NCCL_VER="2.22.3-1+cuda12.5"
CUBLAS_VER="12.5.3.2-1"
NVRTC_VER="12.5.82-1"
for i in "$@"; do
case $i in
--TRT_VER=?*) TRT_VER="${i#*=}";;
--CUDA_VER=?*) CUDA_VER="${i#*=}";;
--CUDNN_VER=?*) CUDNN_VER="${i#*=}";;
--NCCL_VER=?*) NCCL_VER="${i#*=}";;
--CUBLAS_VER=?*) CUBLAS_VER="${i#*=}";;
*) ;;
esac
shift
done
NVCC_VERSION_OUTPUT=$(nvcc --version)
if [[ $(echo $NVCC_VERSION_OUTPUT | grep -oP "\d+\.\d+" | head -n 1) != ${CUDA_VER} ]]; then
echo "The version of pre-installed CUDA is not equal to ${CUDA_VER}."
exit 1
fi
install_ubuntu_requirements() {
apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates
ARCH=$(uname -m)
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb
apt-get update
if [[ $(apt list --installed | grep libcudnn9) ]]; then
apt-get remove --purge -y --allow-change-held-packages libcudnn9*
fi
if [[ $(apt list --installed | grep libnccl) ]]; then
apt-get remove --purge -y --allow-change-held-packages libnccl*
fi
if [[ $(apt list --installed | grep libcublas) ]]; then
apt-get remove --purge -y --allow-change-held-packages libcublas*
fi
if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
fi
CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
apt-get install -y --no-install-recommends libcudnn9-cuda-12=${CUDNN_VER} libcudnn9-dev-cuda-12=${CUDNN_VER}
apt-get install -y --no-install-recommends libnccl2=${NCCL_VER} libnccl-dev=${NCCL_VER}
apt-get install -y --no-install-recommends libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER}
# NVRTC static library doesn't exist in NGC PyTorch container.
NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
apt-get clean
rm -rf /var/lib/apt/lists/*
}
install_centos_requirements() {
CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
yum -y update
yum -y install epel-release
yum remove -y libnccl* && yum -y install libnccl-${NCCL_VER} libnccl-devel-${NCCL_VER}
yum remove -y libcublas* && yum -y install libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER} libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}
yum clean all
}
install_tensorrt() {
#PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
#PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
TRT_CUDA_VERSION="12.5"
if [ -z "$RELEASE_URL_TRT" ];then
ARCH=${TRT_TARGETARCH}
if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/
mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt
# pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
rm -rf /tmp/TensorRT.tar
}
# Install base packages depending on the base OS
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
debian)
install_ubuntu_requirements
install_tensorrt
;;
ubuntu)
install_ubuntu_requirements
install_tensorrt
;;
centos)
install_centos_requirements
install_tensorrt
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac
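
The version pins at the top of the script can be overridden through the --NAME=value arguments parsed in the loop above; a minimal usage sketch (the values shown simply mirror the defaults):

```shell
# Sketch only: values mirror the defaults declared at the top of the script.
./install_tensorrt.sh --TRT_VER=10.2.0.19 --CUDA_VER=12.5 --NCCL_VER=2.22.3-1+cuda12.5
```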

@@ -0,0 +1,329 @@
use std::future::Future;
use std::path::Path;
use std::pin::{pin, Pin};
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, OnceLock};
use std::task::{Context, Poll};
use std::time::Duration;
use async_trait::async_trait;
use cxx::UniquePtr;
use log::{error, warn};
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
use tokio::sync::RwLock;
use tokio::time::{sleep, Instant};
use tokio_stream::wrappers::UnboundedReceiverStream;
use tokio_stream::{Stream, StreamExt};
use tracing::{instrument, span, Level};
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidationError::UnsupportedModality;
use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError};
use text_generation_router::{FinishReason, Token};
use crate::errors::TensorRtLlmBackendError;
use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
// Value used to poll the state of the generation stream
static POLLING_INTERVAL_US: OnceLock<u64> = OnceLock::new();
type InferResult<T> = Result<T, InferError>;
pub(crate) struct Generation {
executor: Arc<RwLock<UniquePtr<TensorRtLlmBackendImpl>>>,
done: Arc<AtomicBool>,
}
/// Holds the user-provided input to be executed, along with a channel allowing
/// all the generated tokens for that request to be bubbled up to the end stream.
pub struct GenerationContext {
sender: UnboundedSender<InferResult<InferStreamResponse>>,
tokenizer: Arc<Tokenizer>,
tokens: Vec<u32>,
done: Arc<AtomicBool>,
queued: Instant,
start: Option<Instant>,
}
impl Stream for Generation {
type Item = usize;
fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
let interval = POLLING_INTERVAL_US.get_or_init(|| {
u64::from_str(option_env!("TRTLLM_BACKEND_POLLING_INTERVAL_US").unwrap_or("100"))
.expect("Invalid value provided for envvar POLLING_INTERVAL_US")
});
if !self.done.load(Ordering::Relaxed) {
let backend = pin!(self.executor.read());
let status = match backend.poll(ctx) {
Poll::Ready(executor_r) => {
let ready = executor_r.num_responses_ready();
if ready == 0 {
Poll::Pending
} else {
Poll::Ready(Some(ready))
}
}
Poll::Pending => Poll::Pending,
};
let waker = ctx.waker().clone();
tokio::spawn(async {
sleep(Duration::from_micros(*interval)).await;
waker.wake();
});
status
} else {
Poll::Ready(None) // end of stream
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
(1, None)
}
}
unsafe impl Send for TensorRtLlmBackendImpl {}
unsafe impl Sync for TensorRtLlmBackendImpl {}
/// Implements the logic to execute generation with TensorRT-LLM executor API in background
pub struct TensorRtLlmBackend {
tokenizer: Arc<Tokenizer>,
// Backing the backend behind a RwLock to allow concurrent read access to retrieve
// the number of available tokens (read only) in the Generation stream
backend: Arc<RwLock<UniquePtr<TensorRtLlmBackendImpl>>>,
}
impl TensorRtLlmBackend {
pub fn new<P: AsRef<Path> + Send + 'static, PP: AsRef<Path> + Send + 'static>(
tokenizer: Tokenizer,
engine_folder: P,
executor_worker_path: PP,
) -> Result<Self, TensorRtLlmBackendError> {
Ok(TensorRtLlmBackend {
tokenizer: Arc::new(tokenizer),
backend: Arc::new(RwLock::new(create_tensorrt_llm_backend(
engine_folder.as_ref().to_str().unwrap(),
executor_worker_path.as_ref().to_str().unwrap(),
))),
})
}
fn validate(request: &ValidGenerateRequest) -> InferResult<&String> {
if request.top_n_tokens > 1 {
return Err(InferError::ValidationError(
ValidationError::TopNTokensDisabled,
));
}
// TODO: Is it really needed? How can it be validated before?
if request.parameters.grammar.is_some() {
return Err(InferError::ValidationError(ValidationError::Grammar));
}
match request.inputs.len() {
0 => Err(InferError::ValidationError(ValidationError::EmptyInput)),
2.. => Err(InferError::GenerationError(
"TensorRT-LLM backend don't support multi-chunk".into(),
)),
1 => match request.inputs.first().expect("Single item-chunk") {
Chunk::Text(text) => Ok(text),
Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))),
},
}
}
fn generate(
&self,
sender: UnboundedSender<InferResult<InferStreamResponse>>,
tokens: Vec<u32>,
top_k: u32,
top_p: f32,
temperature: f32,
repetition_penalty: f32,
frequency_penalty: f32,
seed: u64,
) {
let tokenizer = Arc::clone(&self.tokenizer);
let executor = Arc::clone(&self.backend);
// Let's push this in async context
tokio::spawn(async move {
// Define the generation state
let mut generation = Generation {
executor: executor.clone(),
done: Arc::new(AtomicBool::new(false)),
};
// Define the context over the generation
// TODO(asap): Do we really need so much shared ownership?
let ctx = Box::new(GenerationContext {
sender: sender.clone(),
tokenizer,
tokens: vec![],
done: Arc::clone(&generation.done),
start: None,
queued: Instant::now(),
});
// We are leaking the context on purpose to avoid the box being dropped while there is
// still computation ongoing
// TODO(asap): Can we achieve the same with an Arc<Box<T>> without the need to go unsafe?
let ctx_ = Box::leak(ctx);
// Submit the request to the batcher
let request_id = span!(Level::DEBUG, "submit")
.in_scope(|| async {
let mut handle = executor.write().await;
let request_id = handle.pin_mut().submit(
&tokens,
top_k as i32,
top_p,
temperature,
repetition_penalty,
frequency_penalty,
seed,
);
request_id
})
.await;
while let Some(_) = generation.next().await {
let mut executor_w = executor.write().await;
let executor = executor_w.pin_mut();
span!(Level::DEBUG, "decode")
.in_scope(|| async {
unsafe {
executor.stream_tokens(
request_id,
ctx_,
|ctx: *mut GenerationContext, step: GenerationStep| {
let inner_ctx = &mut *ctx;
// Update the timestamp at which the request started effectively
// Can be a bit off, would need to be before the callback, let's see
inner_ctx.start.get_or_insert(Instant::now());
inner_ctx.done.store(step.is_final, Ordering::Relaxed);
// Ensure we are not running into errors
let parcel = if !step.has_error {
// Insert the latest generated token to the tracker
inner_ctx.tokens.push(step.token_id);
// Decode the token
let text = inner_ctx
.tokenizer
.decode(&[step.token_id], true)
.expect("Failed to decode token");
let special = inner_ctx
.tokenizer
.get_added_vocabulary()
.is_special_token(&text);
// Create the structure holding the token
let token = Token {
id: step.token_id,
text,
logprob: step.log_prob,
special,
};
if step.is_final {
let generated_text = inner_ctx
.tokenizer
.decode(&inner_ctx.tokens, true)
.expect("Failed to decode generated_tokens");
Ok(InferStreamResponse::End {
token,
top_tokens: vec![],
generated_text: GeneratedText {
text: generated_text,
generated_tokens: inner_ctx.tokens.len() as u32,
finish_reason: FinishReason::EndOfSequenceToken,
seed: None,
},
start: inner_ctx.start.unwrap_or(Instant::now()),
queued: inner_ctx.queued,
})
} else {
Ok(InferStreamResponse::Intermediate {
token,
top_tokens: vec![],
})
}
} else {
error!("Error caught while decoding: {}", &step.error_msg);
Err(InferError::GenerationError(step.error_msg))
};
// Send the parcel to the client
inner_ctx
.sender
.send(parcel)
.expect("Failed to sent msg through the channel");
},
);
}
})
.await;
}
// "Properly" free the shared context...
// TODO: clean that piece of sh** asap
unsafe {
let _ = Box::from_raw(ctx_);
}
});
}
}
#[async_trait]
impl Backend for TensorRtLlmBackend {
#[instrument(skip_all)]
fn schedule(
&self,
request: ValidGenerateRequest,
) -> InferResult<UnboundedReceiverStream<InferResult<InferStreamResponse>>> {
// Let's add a few more validations
let input = TensorRtLlmBackend::validate(&request)?;
// Channel to stream the generated tokens as they come from the worker thread back to the transport layer
let (sender, receiver) = unbounded_channel();
// Unpack parameters
let params = &request.parameters;
// Preprocess the inputs to send to TRTLLM backend
let encoding = self
.tokenizer
.encode(input.as_str(), true)
.map_err(|e| InferError::GenerationError(e.to_string()))?;
// Generate the response
self.generate(
sender,
Vec::from(encoding.get_ids()),
params.top_k,
params.top_p,
params.temperature,
params.repetition_penalty,
params.frequency_penalty,
params.seed,
);
Ok(UnboundedReceiverStream::new(receiver))
}
async fn health(&self, _current_health: bool) -> bool {
true
}
}

@@ -0,0 +1,15 @@
use thiserror::Error;
use text_generation_router::server;
#[derive(Debug, Error)]
pub enum TensorRtLlmBackendError {
#[error("Tokenizer error: {0}")]
Tokenizer(String),
#[error("Argument validation error: {0}")]
ArgumentValidation(String),
#[error("WebServer error: {0}")]
WebServer(#[from] server::WebServerError),
#[error("Tokio runtime failed to start: {0}")]
Tokio(#[from] std::io::Error),
}

@@ -0,0 +1,84 @@
//
// Created by mfuntowicz on 6/30/24.
//
#pragma once
#include <cmath>
#include <exception>
#include <filesystem>
#include <limits>
#include <iterator>
#include <vector>
#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
const std::string_view &engineFolder,
const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const {
return TensorRtLlmBackend::IsReady();
}
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty,
float_t frequency_penalty, uint64_t seed) {
// This will copy all the items from the initial slice
std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
return TensorRtLlmBackend::Submit(
std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
const uint64_t requestId,
huggingface::tgi::backends::GenerationContext *ctx,
rust::Fn<void(huggingface::tgi::backends::GenerationContext *,
huggingface::tgi::backends::GenerationStep)> callback) {
size_t numTokens = 0;
for (const auto &item: Poll(requestId)) {
GenerationStep step;
if (!item.hasError()) {
SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
const auto decoded = item.getResult();
const auto token = decoded.outputTokenIds[0][0];
const auto isFinal = decoded.isFinal;
const auto logProb = decoded.logProbs.value()[0][0];
++numTokens;
SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
step = huggingface::tgi::backends::GenerationStep{
static_cast<uint32_t>(token), logProb, isFinal, false, std::move(std::string())
};
SPDLOG_DEBUG("\tStreamTokens -> Post callback");
} else {
// TODO : Return rest::Result with error
const auto what = item.getErrorMsg();
SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", what);
step = huggingface::tgi::backends::GenerationStep{
std::numeric_limits<uint32_t>::max(), 0.0, true, true, std::move(what)
};
}
callback(std::move(ctx), std::move(step));
}
return numTokens;
}
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
// Unconditionally call this to initialize and discover TRTLLM plugins
InitializeBackend();
const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
}

@@ -0,0 +1,78 @@
pub use backend::{GenerationContext, TensorRtLlmBackend};
mod backend;
pub mod errors;
#[cxx::bridge(namespace = "huggingface::tgi::backends")]
mod ffi {
/// Struct used as shared type between rust and C++ to represent the result
/// of a single decoding iteration
pub struct GenerationStep {
token_id: u32,
log_prob: f32,
is_final: bool,
has_error: bool,
error_msg: String,
}
extern "Rust" {
type GenerationContext;
}
unsafe extern "C++" {
include!("backends/trtllm/src/ffi.cpp");
/// Represent an instance of the underlying TensorRT-LLM backend
type TensorRtLlmBackendImpl;
/// Create an instance backed by a std::unique_ptr to manage the lifespan of the backend
///
/// # Arguments
///
/// * `engine_folder`: Path to the folder containing all the TRTLLM engines
/// * `executor_worker`: Path to the TRTLLM executor worker
///
/// returns: <unknown>
///
/// # Examples
///
/// ```
///
/// ```
#[rust_name = "create_tensorrt_llm_backend"]
fn CreateTensorRtLlmBackend(
engine_folder: &str,
executor_worker: &str,
) -> UniquePtr<TensorRtLlmBackendImpl>;
// #[rust_name = "is_ready"]
// fn IsReady(self: &TensorRtLlmBackendImpl) -> bool;
#[rust_name = "num_responses_ready"]
fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
#[rust_name = "submit"]
fn Submit(
self: Pin<&mut TensorRtLlmBackendImpl>,
tokens: &[u32],
top_k: i32,
top_p: f32,
temperature: f32,
repetition_penalty: f32,
frequency_penalty: f32,
seed: u64,
) -> u64;
#[rust_name = "stream_tokens"]
unsafe fn StreamTokens(
self: Pin<&mut TensorRtLlmBackendImpl>,
request_id: u64,
ctx: *mut GenerationContext,
cb: unsafe fn(*mut GenerationContext, GenerationStep),
) -> usize;
// #[rust_name = "shutdown"]
// fn Shutdown(self: Pin<&mut TensorRtLlmBackendImpl>);
}
}

backends/trtllm/src/main.rs

@@ -0,0 +1,166 @@
use std::collections::HashMap;
use std::path::PathBuf;
use clap::Parser;
use tokenizers::{FromPretrainedParameters, Tokenizer};
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackend;
use text_generation_router::server;
/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
#[clap(default_value = "128", long, env)]
max_concurrent_requests: usize,
#[clap(default_value = "2", long, env)]
max_best_of: usize,
#[clap(default_value = "4", long, env)]
max_stop_sequences: usize,
#[clap(default_value = "5", long, env)]
max_top_n_tokens: u32,
#[clap(default_value = "1024", long, env)]
max_input_tokens: usize,
#[clap(default_value = "2048", long, env)]
max_total_tokens: usize,
#[clap(default_value = "4096", long, env)]
max_batch_prefill_tokens: u32,
#[clap(long, env)]
max_batch_total_tokens: Option<u32>,
#[clap(default_value = "0.0.0.0", long, env)]
hostname: String,
#[clap(default_value = "3000", long, short, env)]
port: u16,
#[clap(long, env, required = true)]
tokenizer_name: String,
#[clap(long, env)]
tokenizer_config_path: Option<String>,
#[clap(long, env)]
revision: Option<String>,
#[clap(long, env)]
model_id: String,
#[clap(default_value = "2", long, env)]
validation_workers: usize,
#[clap(long, env)]
json_output: bool,
#[clap(long, env)]
otlp_endpoint: Option<String>,
#[clap(default_value = "text-generation-inference.router", long, env)]
otlp_service_name: String,
#[clap(long, env)]
cors_allow_origin: Option<Vec<String>>,
#[clap(long, env, default_value_t = false)]
messages_api_enabled: bool,
#[clap(default_value = "4", long, env)]
max_client_batch_size: usize,
#[clap(long, env)]
auth_token: Option<String>,
#[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
executor_worker: PathBuf,
}
#[tokio::main]
async fn main() -> Result<(), TensorRtLlmBackendError> {
// Get args
let args = Args::parse();
// Pattern match configuration
let Args {
max_concurrent_requests,
max_best_of,
max_stop_sequences,
max_top_n_tokens,
max_input_tokens,
max_total_tokens,
max_batch_prefill_tokens,
max_batch_total_tokens,
hostname,
port,
tokenizer_name,
tokenizer_config_path,
revision,
model_id,
validation_workers,
json_output,
otlp_endpoint,
otlp_service_name,
cors_allow_origin,
messages_api_enabled,
max_client_batch_size,
auth_token,
executor_worker,
} = args;
// Launch Tokio runtime
text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
// Validate args
if max_input_tokens >= max_total_tokens {
return Err(TensorRtLlmBackendError::ArgumentValidation(
"`max_input_tokens` must be < `max_total_tokens`".to_string(),
));
}
if max_input_tokens as u32 > max_batch_prefill_tokens {
return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
}
if validation_workers == 0 {
return Err(TensorRtLlmBackendError::ArgumentValidation(
"`validation_workers` must be > 0".to_string(),
));
}
if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
if max_batch_prefill_tokens > *max_batch_total_tokens {
return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
}
if max_total_tokens as u32 > *max_batch_total_tokens {
return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
}
}
if !executor_worker.exists() {
return Err(TensorRtLlmBackendError::ArgumentValidation(format!(
"`executor_work` specified path doesn't exists: {}",
executor_worker.display()
)));
}
// Run server
let tokenizer = Tokenizer::from_pretrained(
tokenizer_name.clone(),
Some(FromPretrainedParameters {
revision: revision.clone().unwrap_or(String::from("main")),
user_agent: HashMap::new(),
auth_token,
}),
)
.map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?;
let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?;
server::run(
backend,
max_concurrent_requests,
max_best_of,
max_stop_sequences,
max_top_n_tokens,
max_input_tokens,
max_total_tokens,
validation_workers,
None,
tokenizer_name,
tokenizer_config_path,
revision,
hostname,
port,
cors_allow_origin,
false,
None,
None,
messages_api_enabled,
true,
max_client_batch_size,
)
.await?;
Ok(())
}

View File

@ -0,0 +1,14 @@
//
// Created by mfuntowicz on 7/2/24.
//
#include <catch2/catch_all.hpp>
#include <spdlog/spdlog.h>
#include "../include/backend.h"
TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
spdlog::info("Loading config from: {}", absolute(engines).string());
huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
}

66
backends/v3/Cargo.toml Normal file
View File

@ -0,0 +1,66 @@
[package]
name = "text-generation-router-v3"
description = "Text Generation Webserver"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true
[lib]
path = "src/lib.rs"
[[bin]]
name = "text-generation-router"
path = "src/main.rs"
[dependencies]
async-trait = "0.1.74"
async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] }
axum-tracing-opentelemetry = "0.16"
text-generation-router = { path = "../../router" }
clap = { version = "4.4.5", features = ["derive", "env"] }
grpc-metadata = { path = "../grpc-metadata" }
futures = "0.3.28"
hf-hub = { workspace = true }
jsonschema = { version = "0.17.1", features = ["draft202012"] }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0"
rand = "0.8.5"
reqwest = { version = "0.11.20", features = [] }
serde = "1.0.188"
serde_json = "1.0.107"
thiserror = "1.0.48"
tokenizers = { workspace = true}
tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.14"
tower-http = { version = "0.5.1", features = ["cors"] }
tracing = "0.1.37"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
utoipa = { version = "4.2.0", features = ["axum_extras"] }
utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
minijinja = { version = "2.0.2" }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
futures-util = "0.3.30"
regex = "1.10.3"
once_cell = "1.19.0"
image = "0.25.1"
base64 = { workspace = true }
prost = "^0.12"
tonic = "^0.10"
tower = "^0.4"
[build-dependencies]
tonic-build = "0.10.1"
prost-build = "0.12.1"
[features]
default = ["ngrok"]
ngrok = ["text-generation-router/ngrok"]
google = ["text-generation-router/google"]
kserve = ["text-generation-router/kserve"]

19
backends/v3/build.rs Normal file
View File

@ -0,0 +1,19 @@
use std::fs;
fn main() -> Result<(), Box<dyn std::error::Error>> {
println!("cargo:rerun-if-changed=../../proto/");
fs::create_dir_all("src/client/pb").unwrap_or(());
let mut config = prost_build::Config::new();
config.protoc_arg("--experimental_allow_proto3_optional");
tonic_build::configure()
.build_client(true)
.build_server(false)
.out_dir("src/client/pb")
.include_file("mod.rs")
.compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
.unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
Ok(())
}

508
backends/v3/src/backend.rs Normal file
View File

@ -0,0 +1,508 @@
use crate::client::{Batch, CachedBatch, ClientError, Generation, Health, ShardedClient};
/// Batching and inference logic
use crate::queue::{Entry, Queue};
use async_trait::async_trait;
use nohash_hasher::IntMap;
use std::sync::Arc;
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidGenerateRequest;
use text_generation_router::{FinishReason, PrefillToken, Token};
use tokio::sync::mpsc::error::SendError;
use tokio::sync::{mpsc, Notify};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{info_span, instrument, Instrument, Span};
pub struct BackendV3 {
/// Request queue
queue: Queue,
/// Notify batcher on queue appends
batching_task_notifier: Arc<Notify>,
/// Client clone, used for health checks to skip the queue
client: ShardedClient,
}
impl BackendV3 {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
client: ShardedClient,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
requires_padding: bool,
window_size: Option<u32>,
speculate: u32,
) -> Self {
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
} else {
false
};
let block_size = if flashdecoding { 256 } else { 16 };
let queue = Queue::new(
requires_padding,
block_size,
window_size,
speculate,
max_batch_total_tokens,
);
let batching_task_notifier = Arc::new(Notify::new());
// Spawn batching background task that contains all the inference logic
tokio::spawn(batching_task(
client.clone(),
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
queue.clone(),
batching_task_notifier.clone(),
));
Self {
queue,
batching_task_notifier,
client,
}
}
}
#[async_trait]
impl Backend for BackendV3 {
#[instrument(skip_all)]
fn schedule(
&self,
request: ValidGenerateRequest,
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
// MPSC channel to communicate with the background batching task
let (response_tx, response_rx) = mpsc::unbounded_channel();
// Append the request to the queue
self.queue.append(Entry {
request,
response_tx,
span: Span::current(),
temp_span: None,
queue_time: Instant::now(),
batch_time: None,
block_allocation: None,
});
// Notify the background task that we have a new entry in the queue that needs
// to be batched
self.batching_task_notifier.notify_one();
// Return stream
Ok(UnboundedReceiverStream::new(response_rx))
}
async fn health(&self, current_health: bool) -> bool {
if current_health {
// Generation is healthy, we only check that the shards can allocate on device
self.client.device_health().await
} else {
self.client.model_health().await
}
.is_ok()
}
}
/// Batching logic
/// Will be launched in a background Tokio task
///
/// Batches requests and sends them to the inference server
#[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task(
mut client: ShardedClient,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
queue: Queue,
notifier: Arc<Notify>,
) {
// Infinite loop
loop {
// Wait for a notification from the Infer struct
notifier.notified().await;
// Get the next batch from the queue
// This batch might be smaller than the maximum batch size if there are not enough requests
// waiting in the queue
while let Some((mut entries, batch, span)) = queue
.next_batch(
None,
max_batch_size,
max_batch_prefill_tokens,
max_batch_total_tokens,
)
.await
{
let mut cached_batch = prefill(&mut client, batch, &mut entries)
.instrument(span)
.await;
let mut waiting_tokens = 1;
// We loop until we do not receive any cached batch from the inference server (== until
// all requests have met their stopping criteria)
while let Some(batch) = cached_batch {
// Get current batch info
let batch_size = batch.size;
let batch_max_tokens = batch.max_tokens;
let mut batches = vec![batch];
metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
let min_size = if waiting_tokens >= max_waiting_tokens {
// If we didn't onboard any new requests for >= max_waiting_tokens iterations, we try
// to add a new batch even though its size might be small
None
} else {
// Minimum batch size
Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
};
let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize);
// Try to get a new batch
if let Some((mut new_entries, new_batch, span)) = queue
.next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
.await
{
// Tracking metrics
if min_size.is_some() {
metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
.increment(1);
} else {
metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
.increment(1);
}
entries.iter_mut().for_each(|(_, entry)| {
// Create a new span to add the info that this entry is waiting
// because a new batch is being computed
let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
// Add relationships
span.follows_from(&entry_waiting_span);
entry_waiting_span.follows_from(&span);
// Update entry
entry.temp_span = Some(entry_waiting_span);
});
// Generate one token for this new batch to have the attention past in cache
let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
.instrument(span)
.await;
// Reset waiting counter
waiting_tokens = 1;
// Extend current batch with the new batch
if let Some(new_cached_batch) = new_cached_batch {
entries.extend(new_entries);
batches.push(new_cached_batch);
}
}
// Create span for this batch to add context to inference calls
let next_batch_size = entries.len();
let next_batch_span =
info_span!(parent: None, "batch", batch_size = next_batch_size);
entries.iter_mut().for_each(|(_, entry)| {
// Create a new span to link the batch back to this entry
let entry_batch_span = info_span!(parent: &entry.span, "infer");
// Add relationships
next_batch_span.follows_from(&entry_batch_span);
entry_batch_span.follows_from(&next_batch_span);
// Update entry
entry.temp_span = Some(entry_batch_span);
});
cached_batch = decode(&mut client, batches, &mut entries)
.instrument(next_batch_span)
.await;
waiting_tokens += 1;
}
metrics::gauge!("tgi_batch_current_size").set(0.0);
metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
}
}
}
#[instrument(skip_all)]
async fn prefill(
client: &mut ShardedClient,
batch: Batch,
entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
let start_time = Instant::now();
let batch_id = batch.id;
metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
match client.prefill(batch).await {
Ok((generations, next_batch, timings)) => {
let start_filtering_time = Instant::now();
// Send generated tokens and filter stopped entries
filter_send_generations(generations, entries);
// Filter next batch and remove requests that were stopped
let next_batch = filter_batch(client, next_batch, entries).await;
metrics::histogram!("tgi_batch_forward_duration", "method" => "prefill")
.record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
.record(timings.decode.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
next_batch
}
// If we have an error, we discard the whole batch
Err(err) => {
let _ = client.clear_cache(Some(batch_id)).await;
send_errors(err, entries);
metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
None
}
}
}
#[instrument(skip_all)]
async fn decode(
client: &mut ShardedClient,
batches: Vec<CachedBatch>,
entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
let start_time = Instant::now();
let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
match client.decode(batches).await {
Ok((generations, next_batch, timings)) => {
let start_filtering_time = Instant::now();
// Send generated tokens and filter stopped entries
filter_send_generations(generations, entries);
// Filter next batch and remove requests that were stopped
let next_batch = filter_batch(client, next_batch, entries).await;
if let Some(concat_duration) = timings.concat {
metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
.record(concat_duration.as_secs_f64());
}
metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
.record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
.record(timings.decode.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
next_batch
}
// If we have an error, we discard the whole batch
Err(err) => {
for id in batch_ids {
let _ = client.clear_cache(Some(id)).await;
}
send_errors(err, entries);
metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
None
}
}
}
/// Filter a `batch` and remove all requests not present in `entries`
#[instrument(skip_all)]
async fn filter_batch(
client: &mut ShardedClient,
next_batch: Option<CachedBatch>,
entries: &IntMap<u64, Entry>,
) -> Option<CachedBatch> {
let mut batch = next_batch?;
// No need to filter
if batch.size as usize == entries.len() {
return Some(batch);
}
let id = batch.id;
// Retain only requests that are still in entries
batch.request_ids.retain(|id| entries.contains_key(id));
if batch.request_ids.is_empty() {
// All requests have been filtered out
// Next batch is now empty
// Clear it from the Python shards cache
// We unwrap here as we need to panic since we cannot recover if this method fails
client.clear_cache(Some(id)).await.unwrap();
None
} else {
// Filter Python shard cache
// We unwrap here as we need to panic since we cannot recover if this method fails
client.filter_batch(id, batch.request_ids).await.unwrap()
}
}
/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
/// and filter entries
#[instrument(skip_all)]
fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
generations.into_iter().for_each(|generation| {
let id = generation.request_id;
// Get entry
// We can `expect` here as the request id should always be in the entries
let entry = entries
.get(&id)
.expect("ID not found in entries. This is a bug.");
// Create and enter a span to link this function back to the entry
let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
// Send generation responses back to the infer task
// If we receive an error from the channel, it means that the client dropped the
// request and we need to stop generating, hence the unwrap_or(true)
let stopped = send_responses(generation, entry).map_err(|err| {
tracing::error!("Entry response channel error.");
metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
err
}).unwrap_or(true);
if stopped {
entries.remove(&id).expect("ID not found in entries. This is a bug.");
}
});
}
/// Send responses through the `entry` response channel
fn send_responses(
generation: Generation,
entry: &Entry,
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
// Return directly if the channel is disconnected
if entry.response_tx.is_closed() {
metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
return Ok(true);
}
let mut stopped = false;
if let Some(prefill_tokens) = generation.prefill_tokens {
// Create Token objects
// We do that here instead of in the Python code as Rust for loops are faster
let prefill_tokens = prefill_tokens
.ids
.into_iter()
.zip(prefill_tokens.logprobs)
.zip(prefill_tokens.texts)
.map(|((id, logprob), text)| PrefillToken { id, text, logprob })
.collect();
// Send message
entry
.response_tx
.send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
}
// Create last Token
let tokens_ = generation.tokens.expect("Non empty tokens in generation");
let n = tokens_.ids.len();
metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
let mut iterator = tokens_
.ids
.into_iter()
.zip(tokens_.logprobs)
.zip(tokens_.texts)
.zip(tokens_.is_special)
.enumerate()
.peekable();
while let Some((i, (((id, logprob), text), special))) = iterator.next() {
let token = Token {
id,
text,
logprob,
special,
};
let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
top_tokens_
.ids
.iter()
.zip(top_tokens_.logprobs.iter())
.zip(top_tokens_.texts.iter())
.zip(top_tokens_.is_special.iter())
.map(|(((&id, &logprob), text), &special)| Token {
id,
text: text.to_string(),
logprob,
special,
})
.collect()
} else {
vec![]
};
match (&generation.generated_text, iterator.peek()) {
(Some(generated_text), None) => {
// Generation has ended
stopped = true;
// Send message
entry.response_tx.send(Ok(InferStreamResponse::End {
token,
top_tokens,
generated_text: GeneratedText::from(generated_text.clone()),
queued: entry.queue_time,
start: entry.batch_time.unwrap(),
}))?;
}
_ => {
// Send message
entry
.response_tx
.send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
}
}
}
Ok(stopped)
}
/// Send errors to Infer for all `entries`
#[instrument(skip_all)]
fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
entries.drain().for_each(|(_, entry)| {
// Create and enter a span to link this function back to the entry
let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
let err = InferError::GenerationError(error.to_string());
metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
tracing::error!("{err}");
// unwrap_or is valid here as we don't care if the receiver is gone.
entry
.response_tx
.send(Err(err))
.unwrap_or(());
});
}
impl From<crate::client::GeneratedText> for GeneratedText {
fn from(value: crate::client::GeneratedText) -> Self {
let v3_finish_reason = crate::client::FinishReason::try_from(value.finish_reason).unwrap();
let finish_reason = match v3_finish_reason {
crate::client::FinishReason::Length => FinishReason::Length,
crate::client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
crate::client::FinishReason::StopSequence => FinishReason::StopSequence,
};
Self {
text: value.text,
generated_tokens: value.generated_tokens,
finish_reason,
seed: value.seed,
}
}
}

View File

@ -0,0 +1,284 @@
/// Single shard Client
use crate::client::{pb, Chunk};
use crate::client::{ClientError, Result, WARMUP_IMAGE_BASE64};
use base64::engine::general_purpose::STANDARD;
use base64::Engine;
use grpc_metadata::InjectTelemetryContext;
use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v3::*;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;
/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
stub: TextGenerationServiceClient<Channel>,
}
impl Client {
/// Returns a client connected to the given url
#[allow(dead_code)]
pub async fn connect(uri: Uri) -> Result<Self> {
let channel = Channel::builder(uri).connect().await?;
Ok(Self {
stub: TextGenerationServiceClient::new(channel),
})
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let channel = Channel::from_shared("http://[::]:50051".to_string())
.unwrap()
.connect_with_connector(tower::service_fn(move |_: Uri| {
tokio::net::UnixStream::connect(path.clone())
}))
.await?;
Ok(Self {
stub: TextGenerationServiceClient::new(channel),
})
}
/// Returns a list of uris or unix sockets of all shards
#[instrument(skip(self))]
pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
let response = self.stub.service_discovery(request).await.map_err(|_| {
ClientError::Connection("Server does not support v3 interface".to_string())
})?;
let urls = response
.into_inner()
.urls
.into_iter()
// Remove unix socket prefix
.map(|url| match url.strip_prefix("unix://") {
None => url,
Some(stripped_url) => stripped_url.to_string(),
})
.collect();
Ok(urls)
}
/// Get model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<InfoResponse> {
let request = tonic::Request::new(InfoRequest {}).inject_context();
let response = self.stub.info(request).await?.into_inner();
Ok(response)
}
/// Get model health
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let request = tonic::Request::new(HealthRequest {}).inject_context();
let response = self.stub.health(request).await?.into_inner();
Ok(response)
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
self.stub.clear_cache(request).await?;
Ok(())
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let request = tonic::Request::new(FilterBatchRequest {
batch_id,
request_ids,
})
.inject_context();
let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
Ok(filtered_batch.batch)
}
/// Warmup on a max size batch
///
/// Returns the maximum number of tokens supported by the hardware
#[instrument(skip_all)]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let mut n_tokens = 0;
let mut requests = Vec::new();
// Create requests
while n_tokens < max_prefill_tokens {
let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
let mut input_chunks = Vec::new();
input_chunks
.push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
if n_tokens == 0 {
input_chunks.push(
Chunk::Image(Image {
// Safe unwrap, because we control the data.
data: STANDARD.decode(WARMUP_IMAGE_BASE64).unwrap(),
mimetype: "image/jpeg;base64".to_string(),
})
.into(),
);
}
// Send stringly-typed inputs for compatibility with backends that haven't
// been updated to support chunks.
let mut inputs = String::new();
inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
if n_tokens == 0 {
// 1 request is enough to test vision heads.
// Sending images on other queries easily interferes with truncation.
inputs.push_str(&format!(
"![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
));
}
requests.push(Request {
id: 0,
inputs,
input_chunks: Some(Input {
chunks: input_chunks,
}),
// We truncate the input on the server side to be sure that it has the correct size
truncate,
// Blocks and slots will be set on the server side if we use paged attention
blocks: vec![],
slots: vec![],
// Set sampling parameters to also take these ops into account in the max memory
parameters: Some(NextTokenChooserParameters {
temperature: 0.9,
top_k: 10,
top_p: 0.9,
typical_p: 0.9,
do_sample: false,
seed: 0,
repetition_penalty: 1.2,
frequency_penalty: 0.1,
watermark: true,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: max_total_tokens - truncate,
stop_sequences: vec![],
ignore_eos_token: true,
}),
prefill_logprobs: true,
top_n_tokens: 20,
adapter_id: None,
});
n_tokens += max_input_length;
// Check max_batch_size
if Some(requests.len()) == max_batch_size {
break;
}
}
let batch = Batch {
id: 0,
size: requests.len() as u32,
requests,
max_tokens: max_input_length,
max_blocks: 0,
};
let request = tonic::Request::new(WarmupRequest {
batch: Some(batch),
max_input_length,
max_prefill_tokens,
max_total_tokens,
})
.inject_context();
let response = self.stub.warmup(request).await?.into_inner();
Ok(response.max_supported_total_tokens)
}
/// Generate one token for each request in the given batch
///
/// Returns Generation for each request in batch
/// and the next cached batch
#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
let response = self.stub.prefill(request).await?.into_inner();
Ok((
response.generations,
response.batch,
PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
))
}
/// Generate one token for each request in the given cached batches
///
/// Returns Generation for each request in batches
/// and the next cached batch
#[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
let response = self.stub.decode(request).await?.into_inner();
Ok((
response.generations,
response.batch,
DecodeTimings::new(
response.concat_ns,
response.forward_ns,
response.decode_ns,
response.total_ns,
),
))
}
}
pub struct PrefillTimings {
pub forward: Duration,
pub decode: Duration,
pub total: Duration,
}
impl PrefillTimings {
fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
Self {
forward: Duration::from_nanos(forward_ns),
decode: Duration::from_nanos(decode_ns),
total: Duration::from_nanos(total_ns),
}
}
}
pub struct DecodeTimings {
pub concat: Option<Duration>,
pub forward: Duration,
pub decode: Duration,
pub total: Duration,
}
impl DecodeTimings {
fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
Self {
concat: concat_ns.map(Duration::from_nanos),
forward: Duration::from_nanos(forward_ns),
decode: Duration::from_nanos(decode_ns),
total: Duration::from_nanos(total_ns),
}
}
}

View File

@ -0,0 +1,76 @@
//! Text Generation gRPC client library
use async_trait::async_trait;
use thiserror::Error;
use tonic::transport;
use tonic::Status;
#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;
mod grpc_client;
mod sharded_client;
pub use grpc_client::Client;
pub use pb::generate::v3::{
input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
StoppingCriteriaParameters,
};
pub use sharded_client::ShardedClient;
#[async_trait]
pub trait Health {
/// Check if a generate server is healthy by asking it to allocate a tensor on device
async fn device_health(&self) -> Result<()>;
/// Check if a generate server is healthy by doing a forward pass.
/// EXPENSIVE
async fn model_health(&self) -> Result<()>;
}
#[derive(Debug)]
pub struct ShardInfo {
pub requires_padding: bool,
pub dtype: String,
pub device_type: String,
pub window_size: Option<u32>,
pub speculate: u32,
}
#[derive(Error, Debug, Clone)]
pub enum ClientError {
#[error("Could not connect to Text Generation server: {0}")]
Connection(String),
#[error("Server error: {0}")]
Generation(String),
#[error("Sharded results are empty")]
EmptyResults,
}
impl From<Status> for ClientError {
fn from(err: Status) -> Self {
let err = Self::Generation(err.message().to_string());
tracing::error!("{err}");
err
}
}
impl From<transport::Error> for ClientError {
fn from(err: transport::Error) -> Self {
let err = Self::Connection(err.to_string());
tracing::error!("{err}");
err
}
}
// Small convenience re-wrapping of `Chunk`.
impl From<Chunk> for InputChunk {
fn from(chunk: Chunk) -> Self {
InputChunk { chunk: Some(chunk) }
}
}
static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
pub type Result<T> = std::result::Result<T, ClientError>;

View File

@ -0,0 +1,260 @@
use crate::client::{ClientError, Result};
/// Multi shard Client
use crate::client::{Health, ShardInfo};
use crate::client::grpc_client::{DecodeTimings, PrefillTimings};
use crate::client::{
Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
use crate::client::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;
#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
clients: Vec<Client>,
}
impl ShardedClient {
fn new(clients: Vec<Client>) -> Self {
Self { clients }
}
/// Create a new ShardedClient from a master client. The master client will communicate with
/// the other shards and return all uris/unix sockets with the `service_discovery` gRPC method.
async fn from_master_client(mut master_client: Client) -> Result<Self> {
// Get all uris/unix sockets from the master client
let uris = master_client.service_discovery().await?;
let futures = uris.into_iter().map(Client::connect_uds);
let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
Ok(Self::new(clients?))
}
/// Returns a client connected to the given uri
#[allow(dead_code)]
pub async fn connect(uri: Uri) -> Result<Self> {
let master_client = Client::connect(uri).await?;
Self::from_master_client(master_client).await
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let master_client = Client::connect_uds(path).await?;
Self::from_master_client(master_client).await
}
/// Get the model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<ShardInfo> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.info())
.collect();
join_all(futures).await.pop().unwrap().map(ShardInfo::from)
}
/// GRPC health check
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.health())
.collect();
join_all(futures).await.pop().unwrap()
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.clear_cache(batch_id))
.collect();
join_all(futures).await.into_iter().collect()
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
.collect();
// all shards return the same message
join_all(futures).await.pop().unwrap()
}
/// Warmup on a max size batch
///
/// Returns the maximum number of tokens supported by the hardware
#[instrument(skip(self))]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| {
Box::pin(client.warmup(
max_input_length,
max_prefill_tokens,
max_total_tokens,
max_batch_size,
))
})
.collect();
// Take the minimum value
let results = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<Option<u32>>>>()?;
Ok(results.into_iter().flatten().min())
}
/// Generate one token for each request in the given batch
///
/// Returns Generation for each request in batch
/// and the next cached batch
#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.prefill(batch.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
/// Generate one token for each request in the given cached batches
///
/// Returns Generation for each request in batches
/// and the next cached batch
#[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.decode(batches.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
}
impl From<InfoResponse> for ShardInfo {
fn from(value: InfoResponse) -> Self {
Self {
requires_padding: value.requires_padding,
dtype: value.dtype,
device_type: value.device_type,
window_size: value.window_size,
speculate: value.speculate,
}
}
}
#[async_trait]
impl Health for ShardedClient {
async fn device_health(&self) -> Result<()> {
self.clone().health().await?;
Ok(())
}
async fn model_health(&self) -> Result<()> {
// Dummy batch of 1 token and 1 generated token
let liveness_request = Request {
id: u64::MAX,
inputs: "liveness".to_string(),
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
frequency_penalty: 0.0,
watermark: false,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
stop_sequences: vec![],
ignore_eos_token: false,
}),
top_n_tokens: 0,
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
adapter_id: None,
};
let batch = Batch {
id: u64::MAX,
requests: vec![liveness_request],
size: 1,
max_tokens: 2,
max_blocks: 1,
};
self.clone().prefill(batch).await?;
Ok(())
}
}

142
backends/v3/src/lib.rs Normal file
View File

@ -0,0 +1,142 @@
mod backend;
mod block_allocator;
mod client;
mod queue;
use crate::client::{ClientError, ShardedClient};
pub(crate) use backend::BackendV3;
use serde::Serialize;
use thiserror::Error;
use utoipa::ToSchema;
#[derive(Clone, Debug, Serialize, ToSchema)]
pub struct BackendInfo {
/// Mandatory
#[schema(example = "cuda")]
pub model_device_type: String,
#[schema(example = "torch.float16")]
pub model_dtype: String,
/// Backend parameters
#[schema(example = "1")]
pub speculate: usize,
#[schema(example = "1.2")]
pub waiting_served_ratio: f32,
#[schema(example = "32000")]
pub max_batch_total_tokens: u32,
#[schema(example = "20")]
pub max_waiting_tokens: usize,
#[schema(nullable = true, example = "null")]
pub max_batch_size: Option<usize>,
}
#[allow(clippy::too_many_arguments)]
pub async fn connect_backend(
max_input_tokens: usize,
max_total_tokens: usize,
master_shard_uds_path: String,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: Option<u32>,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
) -> Result<(BackendV3, BackendInfo), V3Error> {
// Helper function
let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
match max_supported_batch_total_tokens {
// Older models do not support automatic max-batch-total-tokens
None => {
let max_batch_total_tokens = max_batch_total_tokens
.unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
tracing::warn!("Model does not support automatic max batch total tokens");
Ok(max_batch_total_tokens)
}
// Flash attention models return their max supported total tokens
Some(max_supported_batch_total_tokens) => {
// Warn if the user set their own max-batch-total-tokens as we will ignore it
if max_batch_total_tokens.is_some() {
tracing::warn!(
"`--max-batch-total-tokens` is deprecated for Flash \
Attention models."
);
tracing::warn!(
"Inferred max batch total tokens: {max_supported_batch_total_tokens}"
);
}
if max_total_tokens as u32 > max_supported_batch_total_tokens {
return Err(V3Error::NotEnoughMemory(max_total_tokens));
}
Ok(max_supported_batch_total_tokens)
}
}
};
let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
.await
.map_err(V3Error::Connection)?;
// server is running on v3
// Clear the cache; useful if the webserver rebooted
sharded_client
.clear_cache(None)
.await
.map_err(V3Error::Cache)?;
// Get info from the shard
let shard_info = sharded_client.info().await.map_err(V3Error::Info)?;
// Warmup model
tracing::info!("Warming up model");
let max_batch_total_tokens = check_max_batch_total_tokens(
sharded_client
.warmup(
max_input_tokens as u32,
max_batch_prefill_tokens,
max_total_tokens as u32,
max_batch_size,
)
.await
.map_err(V3Error::Warmup)?,
)?;
tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
let backend_info = BackendInfo {
waiting_served_ratio,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
model_device_type: shard_info.device_type.clone(),
model_dtype: shard_info.dtype.clone(),
speculate: shard_info.speculate as usize,
};
let backend = BackendV3::new(
sharded_client,
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
shard_info.requires_padding,
shard_info.window_size,
shard_info.speculate,
);
tracing::info!("Using backend V3");
Ok((backend, backend_info))
}
#[derive(Debug, Error)]
pub enum V3Error {
#[error("Unable to clear the Python model shards cache: {0}")]
Cache(ClientError),
#[error("Unable to connect to the Python model shards: {0}")]
Connection(ClientError),
#[error("Unable to get the Python model shards info: {0}")]
Info(ClientError),
#[error("Unable to warmup the Python model shards: {0}")]
Warmup(ClientError),
#[error("Not enough memory to handle `max_total_tokens={0}`")]
NotEnoughMemory(usize),
}

204
backends/v3/src/main.rs Normal file
View File

@ -0,0 +1,204 @@
use clap::{Parser, Subcommand};
use text_generation_router::{server, usage_stats};
use text_generation_router_v3::{connect_backend, V3Error};
use thiserror::Error;
/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
#[command(subcommand)]
command: Option<Commands>,
#[clap(default_value = "128", long, env)]
max_concurrent_requests: usize,
#[clap(default_value = "2", long, env)]
max_best_of: usize,
#[clap(default_value = "4", long, env)]
max_stop_sequences: usize,
#[clap(default_value = "5", long, env)]
max_top_n_tokens: u32,
#[clap(default_value = "1024", long, env)]
max_input_tokens: usize,
#[clap(default_value = "2048", long, env)]
max_total_tokens: usize,
#[clap(default_value = "1.2", long, env)]
waiting_served_ratio: f32,
#[clap(default_value = "4096", long, env)]
max_batch_prefill_tokens: u32,
#[clap(long, env)]
max_batch_total_tokens: Option<u32>,
#[clap(default_value = "20", long, env)]
max_waiting_tokens: usize,
#[clap(long, env)]
max_batch_size: Option<usize>,
#[clap(default_value = "0.0.0.0", long, env)]
hostname: String,
#[clap(default_value = "3000", long, short, env)]
port: u16,
#[clap(default_value = "/tmp/text-generation-server-0", long, env)]
master_shard_uds_path: String,
#[clap(default_value = "bigscience/bloom", long, env)]
tokenizer_name: String,
#[clap(long, env)]
tokenizer_config_path: Option<String>,
#[clap(long, env)]
revision: Option<String>,
#[clap(default_value = "2", long, env)]
validation_workers: usize,
#[clap(long, env)]
api_key: Option<String>,
#[clap(long, env)]
json_output: bool,
#[clap(long, env)]
otlp_endpoint: Option<String>,
#[clap(default_value = "text-generation-inference.router", long, env)]
otlp_service_name: String,
#[clap(long, env)]
cors_allow_origin: Option<Vec<String>>,
#[clap(long, env)]
ngrok: bool,
#[clap(long, env)]
ngrok_authtoken: Option<String>,
#[clap(long, env)]
ngrok_edge: Option<String>,
#[clap(long, env, default_value_t = false)]
messages_api_enabled: bool,
#[clap(long, env, default_value_t = false)]
disable_grammar_support: bool,
#[clap(default_value = "4", long, env)]
max_client_batch_size: usize,
#[clap(default_value = "on", long, env)]
usage_stats: usage_stats::UsageStatsLevel,
}
#[derive(Debug, Subcommand)]
enum Commands {
PrintSchema,
}
#[tokio::main]
async fn main() -> Result<(), RouterError> {
// Get args
let args = Args::parse();
// Pattern match configuration
let Args {
command,
max_concurrent_requests,
max_best_of,
max_stop_sequences,
max_top_n_tokens,
max_input_tokens,
max_total_tokens,
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
hostname,
port,
master_shard_uds_path,
tokenizer_name,
tokenizer_config_path,
revision,
validation_workers,
api_key,
json_output,
otlp_endpoint,
otlp_service_name,
cors_allow_origin,
ngrok,
ngrok_authtoken,
ngrok_edge,
messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
usage_stats,
} = args;
if let Some(Commands::PrintSchema) = command {
use utoipa::OpenApi;
let api_doc = text_generation_router::server::ApiDoc::openapi();
let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
println!("{}", api_doc);
std::process::exit(0);
};
text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
// Validate args
if max_input_tokens >= max_total_tokens {
return Err(RouterError::ArgumentValidation(
"`max_input_tokens` must be < `max_total_tokens`".to_string(),
));
}
if max_input_tokens as u32 > max_batch_prefill_tokens {
return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
}
if validation_workers == 0 {
return Err(RouterError::ArgumentValidation(
"`validation_workers` must be > 0".to_string(),
));
}
if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
if max_batch_prefill_tokens > *max_batch_total_tokens {
return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
}
if max_total_tokens as u32 > *max_batch_total_tokens {
return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
}
}
let (backend, _backend_info) = connect_backend(
max_input_tokens,
max_total_tokens,
master_shard_uds_path,
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
)
.await?;
// Run server
server::run(
backend,
max_concurrent_requests,
max_best_of,
max_stop_sequences,
max_top_n_tokens,
max_input_tokens,
max_total_tokens,
validation_workers,
api_key,
tokenizer_name,
tokenizer_config_path,
revision,
hostname,
port,
cors_allow_origin,
ngrok,
ngrok_authtoken,
ngrok_edge,
messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
usage_stats,
)
.await?;
Ok(())
}
#[derive(Debug, Error)]
enum RouterError {
#[error("Argument validation error: {0}")]
ArgumentValidation(String),
#[error("Backend failed: {0}")]
Backend(#[from] V3Error),
#[error("WebServer error: {0}")]
WebServer(#[from] server::WebServerError),
#[error("Tokio runtime failed to start: {0}")]
Tokio(#[from] std::io::Error),
}

View File

@ -1,17 +1,17 @@
use crate::infer::v3::block_allocator::{BlockAllocation, BlockAllocator};
use crate::infer::InferError;
use crate::infer::InferStreamResponse;
use crate::validation::{
ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
use crate::block_allocator::{BlockAllocation, BlockAllocator};
use crate::client;
use crate::client::{
Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
use nohash_hasher::{BuildNoHashHasher, IntMap};
use std::cmp::{max, min};
use std::collections::VecDeque;
use text_generation_client::v3::{
Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
use text_generation_router::infer::InferError;
use text_generation_router::infer::InferStreamResponse;
use text_generation_router::validation::{
Chunk, ChunksToString, ValidGenerateRequest, ValidGrammar, ValidParameters,
ValidStoppingParameters,
};
use text_generation_client::ChunksToString;
use text_generation_client::Input;
use tokio::sync::{mpsc, oneshot};
use tokio::time::Instant;
use tracing::{info_span, instrument, Instrument, Span};
@ -337,8 +337,22 @@ impl State {
batch_requests.push(Request {
id,
prefill_logprobs: entry.request.decoder_input_details,
input_chunks: Some(Input {
chunks: entry.request.inputs.clone(),
input_chunks: Some(client::Input {
chunks: entry
.request
.inputs
.clone()
.into_iter()
.map(|c| client::InputChunk {
chunk: Some(match c {
Chunk::Text(text) => client::Chunk::Text(text),
Chunk::Image(image) => client::Chunk::Image(client::Image {
data: image.data,
mimetype: image.mimetype,
}),
}),
})
.collect(),
}),
inputs: entry.request.inputs.chunks_to_string(),
truncate: entry.request.truncate,

View File

@ -21,7 +21,7 @@ float-ord = "0.3.2"
serde = {version = "1.0.188", features = ["derive"]}
serde_json = "1.0"
tabled = "0.14.0"
text-generation-client = { path = "../router/client" }
text-generation-client = { path = "../backends/client" }
thiserror = "1.0.48"
tokenizers = { workspace = true }
tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }

View File

@ -1580,16 +1580,11 @@
"type": "object",
"required": [
"model_id",
"model_dtype",
"model_device_type",
"max_concurrent_requests",
"max_best_of",
"max_stop_sequences",
"max_input_tokens",
"max_total_tokens",
"waiting_served_ratio",
"max_batch_total_tokens",
"max_waiting_tokens",
"validation_workers",
"max_client_batch_size",
"router",
@ -1601,18 +1596,6 @@
"example": "null",
"nullable": true
},
"max_batch_size": {
"type": "integer",
"example": "null",
"nullable": true,
"minimum": 0
},
"max_batch_total_tokens": {
"type": "integer",
"format": "int32",
"example": "32000",
"minimum": 0
},
"max_best_of": {
"type": "integer",
"example": "2",
@ -1644,19 +1627,6 @@
"example": "2048",
"minimum": 0
},
"max_waiting_tokens": {
"type": "integer",
"example": "20",
"minimum": 0
},
"model_device_type": {
"type": "string",
"example": "cuda"
},
"model_dtype": {
"type": "string",
"example": "torch.float16"
},
"model_id": {
"type": "string",
"description": "Model info",
@ -1690,11 +1660,6 @@
"version": {
"type": "string",
"example": "0.5.0"
},
"waiting_served_ratio": {
"type": "number",
"format": "float",
"example": "1.2"
}
}
},

View File

@ -431,20 +431,18 @@ Options:
[env: LORA_ADAPTERS=]
```
## DISABLE_USAGE_STATS
## USAGE_STATS
```shell
--disable-usage-stats
Disable sending of all usage statistics
--usage-stats <USAGE_STATS>
Control if anonymous usage stats are collected. Options are "on", "off" and "no-stack". Default is on
[env: DISABLE_USAGE_STATS=]
[env: USAGE_STATS=]
[default: on]
```
## DISABLE_CRASH_REPORTS
```shell
--disable-crash-reports
Disable sending of crash reports, but allow anonymous usage statistics
[env: DISABLE_CRASH_REPORTS=]
Possible values:
- on: Default option, usage statistics are collected anonymously
- off: Disables all collection of usage statistics
- no-stack: Doesn't send the error stack trace or error type, but allows sending a crash event
```
## HELP

View File

@ -36,6 +36,18 @@ To use LoRA in TGI, when starting the server, you can specify the list of LoRA m
LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
```
Additionally, you can specify the path to the LoRA models using the `LORA_ADAPTERS_PATH` environment variable. For example:
```bash
LORA_ADAPTERS=myadapter=/some/path/to/adapter,myadapter2=/another/path/to/adapter
```
Note that it's possible to mix adapter ids with `adapter_id=adapter_path`, e.g.:
```bash
LORA_ADAPTERS=predibase/dbpedia,myadapter=/path/to/dir/
```
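For illustration, a complete launch might look like the sketch below; the `--model-id` value is only a placeholder and should be replaced with the base model your adapters were trained on.
```bash
# Hypothetical launch combining the adapters above with the launcher.
# The --model-id value is a placeholder, not prescribed by this guide.
LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia \
  text-generation-launcher --model-id mistralai/Mistral-7B-v0.1
```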
In the server logs, you will see the following message:
```txt

View File

@ -70,4 +70,6 @@ As of release 2.1.2 this is an example of the data collected:
## How to opt-out
You can easily opt out by passing the `--disable-usage-stats` to the text-generation-launcher command. This will disable all usage statistics. You can also pass `--disable-crash-reports` which disables sending specific crash reports, but allows anonymous usage statistics.
By passing the `--usage-stats` flag to the text-generation-launcher you can control how much usage-statistics data is collected.
`--usage-stats=no-stack` will not send error stack traces or error types, but will continue to send start and stop events.
`--usage-stats=off` will disable all collection entirely.
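A minimal sketch of the two levels described above (the model id is a placeholder):
```bash
# Keep anonymous usage events but drop error stack traces and error types.
text-generation-launcher --model-id <your-model> --usage-stats no-stack

# Turn off usage-statistics collection entirely.
text-generation-launcher --model-id <your-model> --usage-stats off
```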

View File

@ -168,6 +168,33 @@ impl std::fmt::Display for RopeScaling {
}
}
#[derive(Clone, Copy, Debug, ValueEnum)]
pub enum UsageStatsLevel {
/// Default option, usage statistics are collected anonymously
On,
/// Disables all collection of usage statistics
Off,
/// Doesn't send the error stack trace or error type, but allows sending a crash event
NoStack,
}
impl std::fmt::Display for UsageStatsLevel {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Keep in sync with `server`.
match self {
UsageStatsLevel::On => {
write!(f, "on")
}
UsageStatsLevel::Off => {
write!(f, "off")
}
UsageStatsLevel::NoStack => {
write!(f, "no-stack")
}
}
}
}
/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
@ -466,13 +493,11 @@ struct Args {
#[clap(long, env)]
lora_adapters: Option<String>,
/// Disable sending of all usage statistics
#[clap(default_value = "false", long, env)]
disable_usage_stats: bool,
/// Disable sending of crash reports, but allow anonymous usage statistics
#[clap(default_value = "false", long, env)]
disable_crash_reports: bool,
/// Control if anonymous usage stats are collected.
/// Options are "on", "off" and "no-stack"
/// Defaul is on.
#[clap(default_value = "on", long, env)]
usage_stats: UsageStatsLevel,
}
#[derive(Debug)]
@ -1218,12 +1243,8 @@ fn spawn_webserver(
];
// Pass usage stats flags to router
if args.disable_usage_stats {
router_args.push("--disable-usage-stats".to_string());
}
if args.disable_crash_reports {
router_args.push("--disable-crash-reports".to_string());
}
router_args.push("--usage-stats".to_string());
router_args.push(args.usage_stats.to_string());
// Grammar support
if args.disable_grammar_support {

View File

@ -7,25 +7,18 @@ edition.workspace = true
authors.workspace = true
homepage.workspace = true
[lib]
path = "src/lib.rs"
[[bin]]
name = "text-generation-router"
path = "src/main.rs"
[dependencies]
async-trait = "0.1.74"
async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] }
axum-tracing-opentelemetry = "0.16"
text-generation-client = { path = "client" }
clap = { version = "4.4.5", features = ["derive", "env"] }
futures = "0.3.28"
hf-hub = { workspace = true }
itertools = "0.10"
jsonschema = { version = "0.17.1", features = ["draft202012"] }
metrics = "0.23.0"
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0"
@ -55,6 +48,7 @@ base64 = { workspace = true }
sysinfo = "0.30.13"
uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }
csv = "1.3.0"
ureq = "=2.9"
[build-dependencies]

View File

@ -1 +0,0 @@
*

View File

@ -1 +0,0 @@
*

View File

@ -1,528 +1,85 @@
/// Batching and inference logic
use crate::infer::v3::queue::{Entry, Queue};
use crate::infer::{
GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse, Scheduler,
use crate::infer::InferError;
use crate::{
ChatTemplateInputs, GrammarType, Message, MessageChunk, TextMessage, TokenizerConfigToken,
};
use crate::validation::ValidGenerateRequest;
use crate::{FinishReason, PrefillToken, Token};
use nohash_hasher::IntMap;
use std::sync::{
atomic::{AtomicBool, Ordering},
Arc,
};
use text_generation_client::v3::{Batch, CachedBatch, Generation, ShardedClient};
use text_generation_client::ClientError;
use tokio::sync::mpsc::error::SendError;
use tokio::sync::{mpsc, Notify, OwnedSemaphorePermit};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{info_span, instrument, Instrument, Span};
use minijinja::{Environment, ErrorKind, Template};
use minijinja_contrib::pycompat;
pub(crate) struct SchedulerV3 {
/// Request queue
queue: Queue,
/// Notify batcher on queue appends
batching_task_notifier: Arc<Notify>,
/// Raise an exception (custom function) used in the chat templates
pub(crate) fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
}
impl SchedulerV3 {
#[allow(clippy::too_many_arguments)]
#[derive(Clone)]
pub(crate) struct ChatTemplate {
template: Template<'static, 'static>,
bos_token: Option<String>,
eos_token: Option<String>,
use_default_tool_template: bool,
}
impl ChatTemplate {
pub(crate) fn new(
client: ShardedClient,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
requires_padding: bool,
window_size: Option<u32>,
speculate: u32,
generation_health: Arc<AtomicBool>,
template: String,
bos_token: Option<TokenizerConfigToken>,
eos_token: Option<TokenizerConfigToken>,
) -> Self {
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
} else {
false
};
let block_size = if flashdecoding { 256 } else { 16 };
let queue = Queue::new(
requires_padding,
block_size,
window_size,
speculate,
max_batch_total_tokens,
);
let batching_task_notifier = Arc::new(Notify::new());
let mut env = Box::new(Environment::new());
// enable things like .strip() or .capitalize()
env.set_unknown_method_callback(pycompat::unknown_method_callback);
let template_str = template.into_boxed_str();
env.add_function("raise_exception", raise_exception);
// Spawn batching background task that contains all the inference logic
tokio::spawn(batching_task(
client,
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
queue.clone(),
batching_task_notifier.clone(),
generation_health,
));
// check if the template contains the tools variable
let use_default_tool_template =
!template_str.as_ref().replace(' ', "").contains("{{tools}}");
// leaking env and template_str as read-only, static resources for performance.
let template = Box::leak(env)
.template_from_str(Box::leak(template_str))
.unwrap();
Self {
queue,
batching_task_notifier,
}
template,
bos_token: bos_token.map(|token| token.as_str().to_string()),
eos_token: eos_token.map(|token| token.as_str().to_string()),
use_default_tool_template,
}
}
impl Scheduler for SchedulerV3 {
#[instrument(skip_all)]
fn schedule(
pub(crate) fn apply(
&self,
request: ValidGenerateRequest,
permit: OwnedSemaphorePermit,
) -> Result<GenerateStreamResponse, InferError> {
// MPSC channel to communicate with the background batching task
let (response_tx, response_rx) = mpsc::unbounded_channel();
let input_length = request.input_length;
// Append the request to the queue
self.queue.append(Entry {
request,
response_tx,
span: Span::current(),
temp_span: None,
queue_time: Instant::now(),
batch_time: None,
block_allocation: None,
});
// Notify the background task that we have a new entry in the queue that needs
// to be batched
self.batching_task_notifier.notify_one();
// Return stream
Ok((
permit,
input_length,
UnboundedReceiverStream::new(response_rx),
))
}
}
/// Batching logic
/// Will be launched in a background Tokio task
///
/// Batches requests and sends them to the inference server
#[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task(
mut client: ShardedClient,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
queue: Queue,
notifier: Arc<Notify>,
generation_health: Arc<AtomicBool>,
) {
// Infinite loop
loop {
// Wait for a notification from the Infer struct
notifier.notified().await;
// Get the next batch from the queue
// This batch might be smaller than the maximum batch size if there are not enough requests
// waiting in the queue
while let Some((mut entries, batch, span)) = queue
.next_batch(
None,
max_batch_size,
max_batch_prefill_tokens,
max_batch_total_tokens,
)
.await
{
let mut cached_batch = prefill(&mut client, batch, &mut entries, &generation_health)
.instrument(span)
.await;
let mut waiting_tokens = 1;
// We loop until we do not receive any cached batch from the inference server (== until
// all requests have met their stopping criteria)
while let Some(batch) = cached_batch {
// Get current batch info
let batch_size = batch.size;
let batch_max_tokens = batch.max_tokens;
let mut batches = vec![batch];
metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
let min_size = if waiting_tokens >= max_waiting_tokens {
// If we didn't onboard any new requests for at least max_waiting_tokens iterations, we try
// to add a new batch even though its size might be small
None
} else {
// Minimum batch size
Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
};
let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize);
// Try to get a new batch
if let Some((mut new_entries, new_batch, span)) = queue
.next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
.await
{
// Tracking metrics
if min_size.is_some() {
metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
.increment(1);
} else {
metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
.increment(1);
}
entries.iter_mut().for_each(|(_, entry)| {
// Create a new span to add the info that this entry is waiting
// because a new batch is being computed
let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
// Add relationships
span.follows_from(&entry_waiting_span);
entry_waiting_span.follows_from(&span);
// Update entry
entry.temp_span = Some(entry_waiting_span);
});
// Generate one token for this new batch to populate its attention KV cache
let new_cached_batch =
prefill(&mut client, new_batch, &mut new_entries, &generation_health)
.instrument(span)
.await;
// Reset waiting counter
waiting_tokens = 1;
// Extend current batch with the new batch
if let Some(new_cached_batch) = new_cached_batch {
entries.extend(new_entries);
batches.push(new_cached_batch);
}
}
// Create span for this batch to add context to inference calls
let next_batch_size = entries.len();
let next_batch_span =
info_span!(parent: None, "batch", batch_size = next_batch_size);
entries.iter_mut().for_each(|(_, entry)| {
// Create a new span to link the batch back to this entry
let entry_batch_span = info_span!(parent: &entry.span, "infer");
// Add relationships
next_batch_span.follows_from(&entry_batch_span);
entry_batch_span.follows_from(&next_batch_span);
// Update entry
entry.temp_span = Some(entry_batch_span);
});
cached_batch = decode(&mut client, batches, &mut entries, &generation_health)
.instrument(next_batch_span)
.await;
waiting_tokens += 1;
}
metrics::gauge!("tgi_batch_current_size").set(0.0);
metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
}
}
}
#[instrument(skip_all)]
async fn prefill(
client: &mut ShardedClient,
batch: Batch,
entries: &mut IntMap<u64, Entry>,
generation_health: &Arc<AtomicBool>,
) -> Option<CachedBatch> {
let start_time = Instant::now();
let batch_id = batch.id;
metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
match client.prefill(batch).await {
Ok((generations, next_batch, timings)) => {
// Update health
generation_health.store(true, Ordering::SeqCst);
let start_filtering_time = Instant::now();
// Send generated tokens and filter stopped entries
filter_send_generations(generations, entries);
// Filter next batch and remove requests that were stopped
let next_batch = filter_batch(client, next_batch, entries).await;
metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
.record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
.record(timings.decode.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
next_batch
}
// If we have an error, we discard the whole batch
Err(err) => {
// Update health
generation_health.store(false, Ordering::SeqCst);
let _ = client.clear_cache(Some(batch_id)).await;
send_errors(err, entries);
metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
None
}
}
}
#[instrument(skip_all)]
async fn decode(
client: &mut ShardedClient,
batches: Vec<CachedBatch>,
entries: &mut IntMap<u64, Entry>,
generation_health: &Arc<AtomicBool>,
) -> Option<CachedBatch> {
let start_time = Instant::now();
let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
match client.decode(batches).await {
Ok((generations, next_batch, timings)) => {
// Update health
generation_health.store(true, Ordering::SeqCst);
let start_filtering_time = Instant::now();
// Send generated tokens and filter stopped entries
filter_send_generations(generations, entries);
// Filter next batch and remove requests that were stopped
let next_batch = filter_batch(client, next_batch, entries).await;
if let Some(concat_duration) = timings.concat {
metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
.record(concat_duration.as_secs_f64());
}
metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
.record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
.record(timings.decode.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
next_batch
}
// If we have an error, we discard the whole batch
Err(err) => {
generation_health.store(false, Ordering::SeqCst);
for id in batch_ids {
let _ = client.clear_cache(Some(id)).await;
}
send_errors(err, entries);
metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
None
}
}
}
/// Filter a `batch` and remove all requests not present in `entries`
#[instrument(skip_all)]
async fn filter_batch(
client: &mut ShardedClient,
next_batch: Option<CachedBatch>,
entries: &IntMap<u64, Entry>,
) -> Option<CachedBatch> {
let mut batch = next_batch?;
// No need to filter
if batch.size as usize == entries.len() {
return Some(batch);
}
let id = batch.id;
// Retain only requests that are still in entries
batch.request_ids.retain(|id| entries.contains_key(id));
if batch.request_ids.is_empty() {
// All requests have been filtered out
// Next batch is now empty
// Clear it from the Python shards cache
// We unwrap here as we need to panic since we cannot recover if this method fails
client.clear_cache(Some(id)).await.unwrap();
None
} else {
// Filter Python shard cache
// We unwrap here as we need to panic since we cannot recover if this method fails
client.filter_batch(id, batch.request_ids).await.unwrap()
}
}
/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
/// and filter entries
#[instrument(skip_all)]
fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
generations.into_iter().for_each(|generation| {
let id = generation.request_id;
// Get entry
// We can `expect` here as the request id should always be in the entries
let entry = entries
.get(&id)
.expect("ID not found in entries. This is a bug.");
// Create and enter a span to link this function back to the entry
let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
// Send generation responses back to the infer task
// If we receive an error from the mpsc channel, it means that the client dropped the
// request and we need to stop generating, hence the unwrap_or(true)
let stopped = send_responses(generation, entry).map_err(|err| {
tracing::error!("Entry response channel error.");
metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
err
}).unwrap_or(true);
if stopped {
entries.remove(&id).expect("ID not found in entries. This is a bug.");
}
mut messages: Vec<Message>,
grammar_with_prompt: Option<(GrammarType, String)>,
) -> Result<String, InferError> {
if self.use_default_tool_template {
if let Some(last_message) = messages.last_mut() {
if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt {
last_message.content.push(MessageChunk::Text {
text: format!("\n---\n{}\n{}", tool_prompt, tools),
});
}
/// Send responses through the `entry` response channel
fn send_responses(
generation: Generation,
entry: &Entry,
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
// Return directly if the channel is disconnected
if entry.response_tx.is_closed() {
metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
return Ok(true);
}
}
let mut stopped = false;
let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
if let Some(prefill_tokens) = generation.prefill_tokens {
// Create Token objects
// We do that here instead of in the Python code as Rust for loops are faster
let prefill_tokens = prefill_tokens
.ids
.into_iter()
.zip(prefill_tokens.logprobs)
.zip(prefill_tokens.texts)
.map(|((id, logprob), text)| PrefillToken { id, text, logprob })
.collect();
// Send message
entry
.response_tx
.send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
}
// Create last Token
let tokens_ = generation.tokens.expect("Non empty tokens in generation");
let n = tokens_.ids.len();
metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
let mut iterator = tokens_
.ids
.into_iter()
.zip(tokens_.logprobs)
.zip(tokens_.texts)
.zip(tokens_.is_special)
.enumerate()
.peekable();
while let Some((i, (((id, logprob), text), special))) = iterator.next() {
let token = Token {
id,
text,
logprob,
special,
};
let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
top_tokens_
.ids
.iter()
.zip(top_tokens_.logprobs.iter())
.zip(top_tokens_.texts.iter())
.zip(top_tokens_.is_special.iter())
.map(|(((&id, &logprob), text), &special)| Token {
id,
text: text.to_string(),
logprob,
special,
self.template
.render(ChatTemplateInputs {
messages,
bos_token: self.bos_token.as_deref(),
eos_token: self.eos_token.as_deref(),
add_generation_prompt: true,
tools: None,
tools_prompt: None,
})
.collect()
} else {
vec![]
};
match (&generation.generated_text, iterator.peek()) {
(Some(generated_text), None) => {
// Generation has ended
stopped = true;
// Send message
entry.response_tx.send(Ok(InferStreamResponse::End {
token,
top_tokens,
generated_text: GeneratedText::from(generated_text.clone()),
queued: entry.queue_time,
start: entry.batch_time.unwrap(),
}))?;
}
_ => {
// Send message
entry
.response_tx
.send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
}
}
}
Ok(stopped)
}
/// Send errors to Infer for all `entries`
#[instrument(skip_all)]
fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
entries.drain().for_each(|(_, entry)| {
// Create and enter a span to link this function back to the entry
let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
let err = InferError::GenerationError(error.to_string());
metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
tracing::error!("{err}");
// unwrap_or is valid here as we don't care if the receiver is gone.
entry
.response_tx
.send(Err(err))
.unwrap_or(());
});
}
impl From<text_generation_client::v3::GeneratedText> for GeneratedText {
fn from(value: text_generation_client::v3::GeneratedText) -> Self {
let v3_finish_reason =
text_generation_client::v3::FinishReason::try_from(value.finish_reason).unwrap();
let finish_reason = match v3_finish_reason {
text_generation_client::v3::FinishReason::Length => FinishReason::Length,
text_generation_client::v3::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
text_generation_client::v3::FinishReason::StopSequence => FinishReason::StopSequence,
};
Self {
text: value.text,
generated_tokens: value.generated_tokens,
finish_reason,
seed: value.seed,
}
.map_err(InferError::TemplateError)
}
}
// tests
#[cfg(test)]
mod tests {
use crate::infer::raise_exception;
use crate::infer::chat_template::raise_exception;
use crate::{ChatTemplateInputs, TextMessage};
use minijinja::Environment;
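For orientation, here is a minimal, self-contained sketch of how such a minijinja environment behaves; the template string and message values are invented for illustration and are not taken from any model's chat template.

use minijinja::{context, Environment, ErrorKind};
use minijinja_contrib::pycompat;

// Same helper as above: lets a chat template abort rendering with a clear error.
fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
    Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
}

fn main() {
    let mut env = Environment::new();
    // Enable Python-style string methods such as .strip() inside templates.
    env.set_unknown_method_callback(pycompat::unknown_method_callback);
    env.add_function("raise_exception", raise_exception);

    // Illustrative template: reject system messages, otherwise echo trimmed content.
    let template = env
        .template_from_str(
            "{% for m in messages %}{% if m.role == 'system' %}\
             {{ raise_exception('system role not supported') }}\
             {% else %}{{ m.content.strip() }}{% endif %}{% endfor %}",
        )
        .unwrap();

    let rendered = template
        .render(context! { messages => vec![context! { role => "user", content => " Hello " }] })
        .unwrap();
    assert_eq!(rendered, "Hello");
}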

View File

@ -1,34 +0,0 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use text_generation_client::Health;
#[derive(Clone)]
pub(crate) struct HealthCheck {
client: Arc<dyn Health + Send + Sync>,
generation_health: Arc<AtomicBool>,
}
impl HealthCheck {
pub(crate) fn new(
client: Arc<dyn Health + Send + Sync>,
generation_health: Arc<AtomicBool>,
) -> Self {
Self {
client,
generation_health,
}
}
pub(crate) async fn check(&mut self) -> bool {
let value = if self.generation_health.load(Ordering::SeqCst) {
// Generation is healthy, we only check that the shards can allocate on device
self.client.device_health().await
} else {
self.client.model_health().await
}
.is_ok();
// Update generation health
self.generation_health.store(value, Ordering::SeqCst);
value
}
}

View File

@ -1,23 +1,18 @@
mod health;
pub(crate) mod v2;
pub(crate) mod v3;
pub(crate) use health::HealthCheck;
// pub(crate) mod v2;
mod chat_template;
pub mod tool_grammar;
use crate::validation::{ValidGenerateRequest, Validation, ValidationError};
use crate::GrammarType;
use crate::{
ChatTemplateInputs, ChatTemplateVersions, FinishReason, GenerateRequest, HubProcessorConfig,
HubTokenizerConfig, Message, MessageChunk, PrefillToken, TextMessage, Token, ToolChoice,
};
use crate::{
FunctionRef, FunctionsMap, GrammarType, Properties, TokenizerConfigToken, Tool, ToolType, Tools,
ChatTemplateVersions, FinishReason, GenerateRequest, HubProcessorConfig, HubTokenizerConfig,
Message, PrefillToken, Token,
};
use async_trait::async_trait;
use chat_template::ChatTemplate;
use futures::future::try_join_all;
use minijinja::{Environment, ErrorKind, Template};
use minijinja_contrib::pycompat;
use serde_json::{json, Map, Value};
use std::collections::HashMap;
use minijinja::ErrorKind;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use thiserror::Error;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
@ -26,12 +21,14 @@ use tokio_stream::wrappers::UnboundedReceiverStream;
use tokio_stream::StreamExt;
use tracing::instrument;
pub(crate) trait Scheduler {
#[async_trait]
pub trait Backend {
fn schedule(
&self,
request: ValidGenerateRequest,
permit: OwnedSemaphorePermit,
) -> Result<GenerateStreamResponse, InferError>;
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError>;
async fn health(&self, current_health: bool) -> bool;
}
/// Inference struct
@ -39,18 +36,20 @@ pub(crate) trait Scheduler {
pub struct Infer {
/// Validation
validation: Validation,
/// Request scheduler
scheduler: Arc<dyn Scheduler + Send + Sync>,
/// Request backend
backend: Arc<dyn Backend + Send + Sync>,
/// Chat template
chat_template: Option<ChatTemplate>,
/// Inference limit
limit_concurrent_requests: Arc<Semaphore>,
/// Backend health
backend_health: Arc<AtomicBool>,
}
impl Infer {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
scheduler: Arc<dyn Scheduler + Send + Sync>,
backend: impl Backend + Send + Sync + 'static,
validation: Validation,
max_concurrent_requests: usize,
tokenizer_config: HubTokenizerConfig,
@ -71,18 +70,22 @@ impl Infer {
// Inference limit with a semaphore
let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));
// Backend health
let backend_health = Arc::new(AtomicBool::new(false));
Self {
validation,
scheduler,
backend: Arc::new(backend),
chat_template,
limit_concurrent_requests: semaphore,
backend_health,
}
}
/// Add a new request to the queue and return a stream of InferStreamResponse
#[instrument(skip_all)]
pub(crate) async fn generate_stream(
&self,
pub(crate) async fn generate_stream<'a>(
&'a self,
request: GenerateRequest,
) -> Result<GenerateStreamResponse, InferError> {
// Limit concurrent requests by acquiring a permit from the semaphore
@ -103,7 +106,10 @@ impl Infer {
err
})?;
self.scheduler.schedule(valid_request, permit)
let input_length = valid_request.input_length;
let generation_stream = self.backend.schedule(valid_request)?;
Ok((permit, input_length, generation_stream))
}
/// Tokenize the input
@ -155,7 +161,7 @@ impl Infer {
let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);
// Create stream and keep semaphore permit as long as generate lives
let (_permit, _input_length, mut stream) = self.generate_stream(request).await?;
let (_permit, _input_length, stream) = self.generate_stream(request).await?;
// Return values
let mut result_prefill = Vec::new();
@ -165,6 +171,8 @@ impl Infer {
let mut result_start = None;
let mut result_queued = None;
let mut stream = Box::pin(stream);
// Iterate on stream
while let Some(response) = stream.next().await {
match response? {
@ -256,207 +264,15 @@ impl Infer {
let best_response = infer_responses.remove(max_index);
Ok((best_response, infer_responses))
}
}
/// Raise an exception (custom function) used in the chat templates
fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
}
#[derive(Clone)]
struct ChatTemplate {
template: Template<'static, 'static>,
bos_token: Option<String>,
eos_token: Option<String>,
use_default_tool_template: bool,
}
impl ChatTemplate {
fn new(
template: String,
bos_token: Option<TokenizerConfigToken>,
eos_token: Option<TokenizerConfigToken>,
) -> Self {
let mut env = Box::new(Environment::new());
// enable things like .strip() or .capitalize()
env.set_unknown_method_callback(pycompat::unknown_method_callback);
let template_str = template.into_boxed_str();
env.add_function("raise_exception", raise_exception);
// check if the template contains the tools variable
let use_default_tool_template =
!template_str.as_ref().replace(' ', "").contains("{{tools}}");
// leaking env and template_str as read-only, static resources for performance.
let template = Box::leak(env)
.template_from_str(Box::leak(template_str))
.unwrap();
Self {
template,
bos_token: bos_token.map(|token| token.as_str().to_string()),
eos_token: eos_token.map(|token| token.as_str().to_string()),
use_default_tool_template,
}
}
fn apply(
&self,
mut messages: Vec<Message>,
grammar_with_prompt: Option<(GrammarType, String)>,
) -> Result<String, InferError> {
if self.use_default_tool_template {
if let Some(last_message) = messages.last_mut() {
if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt {
last_message.content.push(MessageChunk::Text {
text: format!("\n---\n{}\n{}", tool_prompt, tools),
});
}
}
}
let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
self.template
.render(ChatTemplateInputs {
messages,
bos_token: self.bos_token.as_deref(),
eos_token: self.eos_token.as_deref(),
add_generation_prompt: true,
tools: None,
tools_prompt: None,
})
.map_err(InferError::TemplateError)
}
}
pub struct ToolGrammar {}
impl ToolGrammar {
// find a tool by name
fn find_tool_by_name(tools: &[Tool], name: &str) -> Result<Tool, InferError> {
tools
.iter()
.find(|tool| tool.function.name == name)
.cloned()
.ok_or_else(|| InferError::ToolError(format!("Tool with name {} not found", name)))
}
pub fn apply(
tools: Option<Vec<Tool>>,
tool_choice: ToolChoice,
) -> Result<Option<Tools>, InferError> {
// if no tools are provided, we return None
let tools = match tools {
Some(tools) if !tools.is_empty() => tools,
_ => return Ok(None),
};
let tool_choice = tool_choice.0.unwrap_or(ToolType::OneOf);
// if tools are provided and no tool_choice is specified, we default to OneOf
let tools_to_use = match tool_choice {
ToolType::FunctionName(name) => {
vec![Self::find_tool_by_name(&tools, &name)?]
}
ToolType::Function { function } => {
vec![Self::find_tool_by_name(&tools, &function.name)?]
}
ToolType::OneOf => tools,
ToolType::NoTool => return Ok(None),
};
// adds the error notification function for LLM feedback if required
let mut text_response_properties = Map::new();
text_response_properties.insert(
"error".to_string(),
serde_json::json!({
"type": "string",
"description": "The error or issue to notify"
}),
);
text_response_properties.insert(
"_name".to_string(),
serde_json::json!({
"type": "string",
"const": "notify_error"
}),
);
let functions: HashMap<String, serde_json::Value> = tools_to_use
.iter()
.map(|tool| {
let func = tool.function.clone();
// Clone the existing parameters, which are expected to be a JSON object
let mut params = if let Value::Object(params) = &func.arguments {
params.clone()
} else {
Map::new()
};
// Insert the function's description at the top level, outside of properties
params.insert(
"description".to_string(),
Value::String(func.description.clone().unwrap_or_default()),
);
// Ensure 'properties' exists and is an object
let properties = params
.entry("properties".to_string())
.or_insert_with(|| json!({}))
.as_object_mut()
.unwrap();
// Insert the constant for the function name inside 'properties'
properties.insert(
"_name".to_string(),
json!({
"type": "string",
"const": func.name.clone(),
// "description": "The name of the function"
}),
);
// Check that 'required' exists and is an array; if not, create an empty array.
let required = params
.entry("required".to_string())
.or_insert_with(|| json!([]))
.as_array_mut()
.unwrap();
// Add '_name' to the 'required' array if it is not already present
if !required.iter().any(|r| r == "_name") {
required.push(json!("_name"));
}
(func.name, Value::Object(params))
})
.chain([(
"notify_error".to_string(),
serde_json::json!({
"properties": text_response_properties,
"required": ["error", "_name"],
"type": "object"
}),
)])
.collect();
let tools = Tools {
functions_map: FunctionsMap { functions },
properties: Properties {
function: tools_to_use
.iter()
.map(|tool| FunctionRef {
ref_path: format!("#/$functions/{}", tool.function.name.clone()),
})
.chain(std::iter::once(FunctionRef {
ref_path: "#/$functions/notify_error".to_string(),
}))
.collect(),
},
};
Ok(Some(tools))
#[instrument(skip(self))]
pub(crate) async fn health(&self) -> bool {
let health = self
.backend
.health(self.backend_health.load(Ordering::SeqCst))
.await;
self.backend_health.store(health, Ordering::SeqCst);
health
}
}
@ -468,15 +284,15 @@ pub(crate) type GenerateStreamResponse = (
);
#[derive(Debug)]
pub(crate) struct GeneratedText {
pub(crate) text: String,
pub(crate) generated_tokens: u32,
pub(crate) finish_reason: FinishReason,
pub(crate) seed: Option<u64>,
pub struct GeneratedText {
pub text: String,
pub generated_tokens: u32,
pub finish_reason: FinishReason,
pub seed: Option<u64>,
}
#[derive(Debug)]
pub(crate) enum InferStreamResponse {
pub enum InferStreamResponse {
// Optional first message
Prefill(Vec<PrefillToken>),
// Intermediate messages
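As a rough illustration of the new trait boundary, a do-nothing backend could be wired into Infer as in the hypothetical sketch below; NullBackend is an invented name, and ValidGenerateRequest, InferStreamResponse and InferError are the crate-internal types shown above, assumed to be in scope.

use async_trait::async_trait;
use tokio_stream::wrappers::UnboundedReceiverStream;

struct NullBackend;

#[async_trait]
impl Backend for NullBackend {
    fn schedule(
        &self,
        _request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
        // Fail every request immediately: the stream yields one error and then closes.
        let _ = tx.send(Err(InferError::GenerationError("null backend".to_string())));
        Ok(UnboundedReceiverStream::new(rx))
    }

    async fn health(&self, _current_health: bool) -> bool {
        // Nothing to probe, so always report healthy.
        true
    }
}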

View File

@ -0,0 +1,135 @@
use crate::infer::InferError;
use crate::{FunctionRef, FunctionsMap, Properties, Tool, ToolChoice, ToolType, Tools};
use serde_json::{json, Map, Value};
use std::collections::HashMap;
pub(crate) struct ToolGrammar {}
impl ToolGrammar {
// find a tool by name
fn find_tool_by_name(tools: &[Tool], name: &str) -> Result<Tool, InferError> {
tools
.iter()
.find(|tool| tool.function.name == name)
.cloned()
.ok_or_else(|| InferError::ToolError(format!("Tool with name {} not found", name)))
}
pub fn apply(
tools: Option<Vec<Tool>>,
tool_choice: ToolChoice,
) -> Result<Option<Tools>, InferError> {
// if no tools are provided, we return None
let tools = match tools {
Some(tools) if !tools.is_empty() => tools,
_ => return Ok(None),
};
let tool_choice = tool_choice.0.unwrap_or(ToolType::OneOf);
// if tools are provided and no tool_choice is specified, we default to OneOf
let tools_to_use = match tool_choice {
ToolType::FunctionName(name) => {
vec![Self::find_tool_by_name(&tools, &name)?]
}
ToolType::Function { function } => {
vec![Self::find_tool_by_name(&tools, &function.name)?]
}
ToolType::OneOf => tools,
ToolType::NoTool => return Ok(None),
};
// adds the error notification function for LLM feedback if required
let mut text_response_properties = Map::new();
text_response_properties.insert(
"error".to_string(),
serde_json::json!({
"type": "string",
"description": "The error or issue to notify"
}),
);
text_response_properties.insert(
"_name".to_string(),
serde_json::json!({
"type": "string",
"const": "notify_error"
}),
);
let functions: HashMap<String, serde_json::Value> = tools_to_use
.iter()
.map(|tool| {
let func = tool.function.clone();
// Clone the existing parameters, which are expected to be a JSON object
let mut params = if let Value::Object(params) = &func.arguments {
params.clone()
} else {
Map::new()
};
// Insert the function's description at the top level, outside of properties
params.insert(
"description".to_string(),
Value::String(func.description.clone().unwrap_or_default()),
);
// Ensure 'properties' exists and is an object
let properties = params
.entry("properties".to_string())
.or_insert_with(|| json!({}))
.as_object_mut()
.unwrap();
// Insert the constant for the function name inside 'properties'
properties.insert(
"_name".to_string(),
json!({
"type": "string",
"const": func.name.clone(),
// "description": "The name of the function"
}),
);
// Check that 'required' exists and is an array; if not, create an empty array.
let required = params
.entry("required".to_string())
.or_insert_with(|| json!([]))
.as_array_mut()
.unwrap();
// Add '_name' to the 'required' array if it is not already present
if !required.iter().any(|r| r == "_name") {
required.push(json!("_name"));
}
(func.name, Value::Object(params))
})
.chain([(
"notify_error".to_string(),
serde_json::json!({
"properties": text_response_properties,
"required": ["error", "_name"],
"type": "object"
}),
)])
.collect();
let tools = Tools {
functions_map: FunctionsMap { functions },
properties: Properties {
function: tools_to_use
.iter()
.map(|tool| FunctionRef {
ref_path: format!("#/$functions/{}", tool.function.name.clone()),
})
.chain(std::iter::once(FunctionRef {
ref_path: "#/$functions/notify_error".to_string(),
}))
.collect(),
},
};
Ok(Some(tools))
}
}

View File

@ -1,4 +1,4 @@
mod queue;
mod scheduler;
pub(crate) use scheduler::SchedulerV2;
pub(crate) use scheduler::BackendV2;

View File

@ -1,7 +1,7 @@
/// Batching and inference logic
use crate::infer::v2::queue::{Entry, Queue};
use crate::infer::{
GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse, Scheduler,
Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
};
use crate::validation::ValidGenerateRequest;
use crate::{FinishReason, PrefillToken, Token};
@ -18,14 +18,14 @@ use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{info_span, instrument, Instrument, Span};
pub(crate) struct SchedulerV2 {
pub(crate) struct BackendV2 {
/// Request queue
queue: Queue,
/// Notify batcher on queue appends
batching_task_notifier: Arc<Notify>,
}
impl SchedulerV2 {
impl BackendV2 {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
client: ShardedClient,
@ -69,7 +69,7 @@ impl SchedulerV2 {
}
}
impl Scheduler for SchedulerV2 {
impl Backend for BackendV2 {
#[instrument(skip_all)]
fn schedule(
&self,

View File

@ -1,5 +0,0 @@
mod block_allocator;
mod queue;
mod scheduler;
pub(crate) use scheduler::SchedulerV3;

View File

@ -1,11 +1,12 @@
/// Text Generation Inference Webserver
pub mod config;
mod infer;
pub mod infer;
pub mod server;
mod validation;
pub mod validation;
#[cfg(feature = "kserve")]
mod kserve;
pub mod logging;
pub mod usage_stats;
@ -148,12 +149,13 @@ pub struct Info {
pub model_id: String,
#[schema(nullable = true, example = "e985a63cdc139290c5f700ff1929f0b5942cced2")]
pub model_sha: Option<String>,
#[schema(example = "torch.float16")]
pub model_dtype: String,
#[schema(example = "cuda")]
pub model_device_type: String,
// #[schema(example = "torch.float16")]
// pub model_dtype: String,
// #[schema(example = "cuda")]
// pub model_device_type: String,
#[schema(nullable = true, example = "text-generation")]
pub model_pipeline_tag: Option<String>,
/// Router Parameters
#[schema(example = "128")]
pub max_concurrent_requests: usize,
@ -165,18 +167,11 @@ pub struct Info {
pub max_input_tokens: usize,
#[schema(example = "2048")]
pub max_total_tokens: usize,
#[schema(example = "1.2")]
pub waiting_served_ratio: f32,
#[schema(example = "32000")]
pub max_batch_total_tokens: u32,
#[schema(example = "20")]
pub max_waiting_tokens: usize,
#[schema(nullable = true, example = "null")]
pub max_batch_size: Option<usize>,
#[schema(example = "2")]
pub validation_workers: usize,
#[schema(example = "32")]
pub max_client_batch_size: usize,
/// Router Info
#[schema(example = "text-generation-router")]
pub router: &'static str,
@ -624,7 +619,7 @@ impl ChatCompletion {
message,
logprobs: return_logprobs
.then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))),
finish_reason: details.finish_reason.to_string(),
finish_reason: details.finish_reason.format(true),
}],
usage: Usage {
prompt_tokens: details.prefill.len() as u32,
@ -1068,23 +1063,23 @@ impl From<CompatGenerateRequest> for GenerateRequest {
#[derive(Debug, Serialize, ToSchema)]
pub struct PrefillToken {
#[schema(example = 0)]
id: u32,
pub id: u32,
#[schema(example = "test")]
text: String,
pub text: String,
#[schema(nullable = true, example = -0.34)]
logprob: f32,
pub logprob: f32,
}
#[derive(Debug, Serialize, ToSchema, Clone)]
pub struct Token {
#[schema(example = 0)]
id: u32,
pub id: u32,
#[schema(example = "test")]
text: String,
pub text: String,
#[schema(nullable = true, example = -0.34)]
logprob: f32,
pub logprob: f32,
#[schema(example = "false")]
special: bool,
pub special: bool,
}
#[derive(Debug, Serialize, ToSchema)]
@ -1102,7 +1097,7 @@ pub struct SimpleToken {
#[derive(Debug, Serialize, ToSchema)]
#[serde(rename_all(serialize = "snake_case"))]
#[schema(example = "Length")]
pub(crate) enum FinishReason {
pub enum FinishReason {
#[schema(rename = "length")]
Length,
#[serde(rename = "eos_token")]
@ -1122,6 +1117,15 @@ impl std::fmt::Display for FinishReason {
}
}
impl FinishReason {
pub fn format(&self, use_stop: bool) -> String {
match self {
FinishReason::EndOfSequenceToken if use_stop => "stop".to_string(),
_ => self.to_string(),
}
}
}
#[derive(Serialize, ToSchema)]
pub(crate) struct BestOfSequence {
#[schema(example = "test")]
@ -1162,6 +1166,12 @@ pub(crate) struct GenerateResponse {
pub details: Option<Details>,
}
#[derive(Serialize, ToSchema)]
pub(crate) struct ChatTokenizeResponse {
pub(crate) tokenize_response: TokenizeResponse,
pub(crate) templated_text: String,
}
#[derive(Serialize, ToSchema)]
#[serde(transparent)]
pub(crate) struct TokenizeResponse(Vec<SimpleToken>);
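A small illustrative check of the new FinishReason::format helper, written as a hypothetical test inside router/src/lib.rs: the OpenAI-compatible chat path reports "stop" for an end-of-sequence token, while every other case falls back to the Display form.

#[cfg(test)]
mod finish_reason_format_sketch {
    use super::FinishReason;

    #[test]
    fn eos_maps_to_stop_for_chat() {
        // Chat completions use the OpenAI-style name.
        assert_eq!(FinishReason::EndOfSequenceToken.format(true), "stop");
        // Without use_stop, the variant keeps its usual string form.
        assert_eq!(
            FinishReason::EndOfSequenceToken.format(false),
            FinishReason::EndOfSequenceToken.to_string()
        );
    }
}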

81
router/src/logging.rs Normal file
View File

@ -0,0 +1,81 @@
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry::sdk::trace;
use opentelemetry::sdk::trace::Sampler;
use opentelemetry::sdk::Resource;
use opentelemetry::{global, KeyValue};
use opentelemetry_otlp::WithExportConfig;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
/// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
/// - otlp_endpoint is an optional URL to an OpenTelemetry collector
/// - otlp_service_name is the service name to appear in APM
/// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (defaults to INFO)
/// - LOG_FORMAT may be TEXT or JSON (defaults to TEXT)
/// - LOG_COLORIZE may be "false" or "true" (defaults to "true" on ANSI-capable platforms)
pub fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
let mut layers = Vec::new();
// STDOUT/STDERR layer
let ansi = std::env::var("LOG_COLORIZE") != Ok("1".to_string());
let fmt_layer = tracing_subscriber::fmt::layer()
.with_file(true)
.with_ansi(ansi)
.with_line_number(true);
let fmt_layer = match json_output {
true => fmt_layer.json().flatten_event(true).boxed(),
false => fmt_layer.boxed(),
};
layers.push(fmt_layer);
// OpenTelemetry tracing layer
if let Some(otlp_endpoint) = otlp_endpoint {
global::set_text_map_propagator(TraceContextPropagator::new());
let tracer = opentelemetry_otlp::new_pipeline()
.tracing()
.with_exporter(
opentelemetry_otlp::new_exporter()
.tonic()
.with_endpoint(otlp_endpoint),
)
.with_trace_config(
trace::config()
.with_resource(Resource::new(vec![KeyValue::new(
"service.name",
otlp_service_name,
)]))
.with_sampler(Sampler::AlwaysOn),
)
.install_batch(opentelemetry::runtime::Tokio);
if let Ok(tracer) = tracer {
layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
init_tracing_opentelemetry::init_propagator().unwrap();
};
}
// Filter events with LOG_LEVEL
let varname = "LOG_LEVEL";
let env_filter = if let Ok(log_level) = std::env::var(varname) {
// Override to avoid simple logs being spammed with tokio-level information
let log_level = match &log_level[..] {
"warn" => "text_generation_launcher=warn,text_generation_router=warn",
"info" => "text_generation_launcher=info,text_generation_router=info",
"debug" => "text_generation_launcher=debug,text_generation_router=debug",
log_level => log_level,
};
EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.parse_lossy(log_level)
} else {
EnvFilter::new("info")
};
tracing_subscriber::registry()
.with(env_filter)
.with(layers)
.init();
}
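A hypothetical call site for this new module; the text_generation_router::logging path is assumed from the pub mod logging declaration above.

fn main() {
    // Honor LOG_LEVEL / LOG_COLORIZE from the environment, skip OTLP export,
    // and emit plain-text (non-JSON) logs.
    text_generation_router::logging::init_logging(
        None,                                 // no OpenTelemetry collector endpoint
        "text-generation-router".to_string(), // service name, only used when exporting
        false,                                // json_output
    );
    tracing::info!("logging initialized");
}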

File diff suppressed because it is too large

View File

@ -1,4 +1,5 @@
use crate::config::Config;
use clap::ValueEnum;
use csv::ReaderBuilder;
use reqwest::header::HeaderMap;
use serde::Serialize;
@ -13,6 +14,13 @@ use uuid::Uuid;
const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
#[derive(Copy, Clone, Debug, Serialize, ValueEnum)]
pub enum UsageStatsLevel {
On,
NoStack,
Off,
}
#[derive(Debug, Clone, Serialize)]
pub struct UserAgent {
pub uid: String,
@ -71,72 +79,69 @@ impl UsageStatsEvent {
#[derive(Debug, Clone, Serialize)]
pub struct Args {
model_config: Option<Config>,
tokenizer_config: Option<String>,
tokenizer_class: Option<String>,
max_concurrent_requests: usize,
max_best_of: usize,
max_stop_sequences: usize,
max_top_n_tokens: u32,
max_input_tokens: usize,
max_total_tokens: usize,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: Option<u32>,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
// waiting_served_ratio: f32,
// max_batch_prefill_tokens: u32,
// max_batch_total_tokens: Option<u32>,
// max_waiting_tokens: usize,
// max_batch_size: Option<usize>,
revision: Option<String>,
validation_workers: usize,
messages_api_enabled: bool,
disable_grammar_support: bool,
max_client_batch_size: usize,
disable_usage_stats: bool,
disable_crash_reports: bool,
usage_stats_level: UsageStatsLevel,
}
impl Args {
#[allow(clippy::too_many_arguments)]
pub fn new(
model_config: Option<Config>,
tokenizer_config: Option<String>,
tokenizer_class: Option<String>,
max_concurrent_requests: usize,
max_best_of: usize,
max_stop_sequences: usize,
max_top_n_tokens: u32,
max_input_tokens: usize,
max_total_tokens: usize,
waiting_served_ratio: f32,
max_batch_prefill_tokens: u32,
max_batch_total_tokens: Option<u32>,
max_waiting_tokens: usize,
max_batch_size: Option<usize>,
// waiting_served_ratio: f32,
// max_batch_prefill_tokens: u32,
// max_batch_total_tokens: Option<u32>,
// max_waiting_tokens: usize,
// max_batch_size: Option<usize>,
revision: Option<String>,
validation_workers: usize,
messages_api_enabled: bool,
disable_grammar_support: bool,
max_client_batch_size: usize,
disable_usage_stats: bool,
disable_crash_reports: bool,
usage_stats_level: UsageStatsLevel,
) -> Self {
Self {
model_config,
tokenizer_config,
tokenizer_class,
max_concurrent_requests,
max_best_of,
max_stop_sequences,
max_top_n_tokens,
max_input_tokens,
max_total_tokens,
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
// waiting_served_ratio,
// max_batch_prefill_tokens,
// max_batch_total_tokens,
// max_waiting_tokens,
// max_batch_size,
revision,
validation_workers,
messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
disable_usage_stats,
disable_crash_reports,
usage_stats_level,
}
}
}

View File

@ -5,13 +5,12 @@ use crate::{
GenerateParameters, GenerateRequest, GrammarType, HubPreprocessorConfig, Idefics2Preprocessor,
};
use base64::{engine::general_purpose::STANDARD, Engine};
use image::{io::Reader as ImageReader, ImageFormat};
use image::{ImageFormat, ImageReader};
use jsonschema::{Draft, JSONSchema};
use rand::{thread_rng, Rng};
use serde_json::Value;
use std::io::Cursor;
use std::iter;
use text_generation_client::{Chunk, Image, InputChunk};
use thiserror::Error;
use tokenizers::tokenizer::Tokenizer;
use tokio::sync::mpsc;
@ -96,7 +95,7 @@ impl Validation {
&self,
inputs: String,
truncate: Option<usize>,
) -> Result<Option<(tokenizers::Encoding, Vec<InputChunk>)>, ValidationError> {
) -> Result<Option<(tokenizers::Encoding, Vec<Chunk>)>, ValidationError> {
// If we have a fast tokenizer
if let Some(sender) = &self.sender {
// Create response channel
@ -122,7 +121,7 @@ impl Validation {
inputs: String,
truncate: Option<usize>,
max_new_tokens: Option<u32>,
) -> Result<(Vec<InputChunk>, usize, u32), ValidationError> {
) -> Result<(Vec<Chunk>, usize, u32), ValidationError> {
// If we have a fast tokenizer
if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
// Create response channel
@ -181,11 +180,7 @@ impl Validation {
input_length = input_length.saturating_sub(max_new_tokens as usize);
}
Ok((
vec![Chunk::Text(inputs).into()],
input_length,
max_new_tokens,
))
Ok((vec![Chunk::Text(inputs)], input_length, max_new_tokens))
}
}
@ -589,7 +584,7 @@ fn prepare_input(
tokenizer: &Tokenizer,
config: Option<&Config>,
preprocessor_config: Option<&HubPreprocessorConfig>,
) -> Result<(tokenizers::Encoding, Vec<InputChunk>), ValidationError> {
) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
use Config::*;
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
let (tokenizer_query, input_chunks) = match config {
@ -601,16 +596,16 @@ fn prepare_input(
let chunk_start = chunk.start();
let chunk_end = chunk.end();
if chunk_start != start {
input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()).into());
input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
tokenizer_query.push_str(&inputs[start..chunk_start]);
}
let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
input_chunks.push(Chunk::Image(Image { data, mimetype }).into());
input_chunks.push(Chunk::Image(Image { data, mimetype }));
tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
start = chunk_end;
}
if start != inputs.len() {
input_chunks.push(Chunk::Text(inputs[start..].to_string()).into());
input_chunks.push(Chunk::Text(inputs[start..].to_string()));
tokenizer_query.push_str(&inputs[start..]);
}
@ -618,7 +613,7 @@ fn prepare_input(
(tokenizer_query, input_chunks)
}
_ => (inputs.clone(), vec![Chunk::Text(inputs).into()]),
_ => (inputs.clone(), vec![Chunk::Text(inputs)]),
};
// Get the number of tokens in the input
@ -631,18 +626,51 @@ fn prepare_input(
type TokenizerRequest = (
(String, Option<usize>),
oneshot::Sender<Result<(tokenizers::Encoding, Vec<InputChunk>), ValidationError>>,
oneshot::Sender<Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError>>,
Span,
);
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Image {
pub data: Vec<u8>,
pub mimetype: String,
}
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Chunk {
Text(String),
Image(Image),
}
/// Convert input chunks to a stringly-typed input for backwards
/// compat for backends that haven't implemented chunked inputs.
pub trait ChunksToString {
/// Convert chunks to string.
fn chunks_to_string(&self) -> String;
}
impl ChunksToString for Vec<Chunk> {
fn chunks_to_string(&self) -> String {
let mut output = String::new();
self.iter().for_each(|c| match &c {
Chunk::Text(text) => output.push_str(text),
Chunk::Image(Image { data, mimetype }) => {
let encoded = STANDARD.encode(data);
output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
}
});
output
}
}
#[derive(Debug, Clone)]
pub(crate) enum ValidGrammar {
pub enum ValidGrammar {
Json(String),
Regex(String),
}
#[derive(Debug, Clone)]
pub(crate) struct ValidParameters {
pub struct ValidParameters {
/// exponential scaling output probability distribution
pub temperature: f32,
/// restricting to the k highest probability elements
@ -666,7 +694,7 @@ pub(crate) struct ValidParameters {
}
#[derive(Debug, Clone)]
pub(crate) struct ValidStoppingParameters {
pub struct ValidStoppingParameters {
/// Maximum number of generated tokens
pub max_new_tokens: u32,
/// Optional stopping sequences
@ -677,8 +705,8 @@ pub(crate) struct ValidStoppingParameters {
}
#[derive(Debug, Clone)]
pub(crate) struct ValidGenerateRequest {
pub inputs: Vec<InputChunk>,
pub struct ValidGenerateRequest {
pub inputs: Vec<Chunk>,
pub input_length: u32,
pub truncate: u32,
pub decoder_input_details: bool,
@ -750,6 +778,8 @@ pub enum ValidationError {
InvalidImageContent(String),
#[error("Could not fetch image: {0}")]
FailedFetchImage(#[from] reqwest::Error),
#[error("{0} modality is not supported")]
UnsupportedModality(&'static str),
}
#[cfg(test)]
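To illustrate the backwards-compat flattening above, a small hypothetical sketch (assuming the public text_generation_router::validation exports; four zero bytes stand in for real image data):

use text_generation_router::validation::{Chunk, ChunksToString, Image};

fn main() {
    let chunks = vec![
        Chunk::Text("Describe this image: ".to_string()),
        Chunk::Image(Image {
            data: vec![0u8; 4],
            mimetype: "image/png".to_string(),
        }),
    ];
    // Image chunks are re-encoded as inline markdown data URIs.
    assert_eq!(
        chunks.chunks_to_string(),
        "Describe this image: ![](data:image/png;base64,AAAAAA==)"
    );
}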

View File

@ -34,7 +34,6 @@ def reshape_and_cache(
def paged_attention(
out: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
@ -85,7 +84,7 @@ def paged_attention(
# This fails because we're using causal attention, so window_right is set to 0 and the split logic is never applied.
if softcap is None:
softcap = 0.0
out2 = flash_attn_2_cuda.varlen_fwd(
out = flash_attn_2_cuda.varlen_fwd(
query,
key_cache,
value_cache,
@ -108,13 +107,15 @@ def paged_attention(
False, # return softmax
None, # generator
)
return out2[0]
return out[0]
else:
if softcap is not None:
raise RuntimeError("Paged attention doesn't support softcapping")
input_lengths = seqlen.input_lengths
from vllm._C import ops
out = torch.empty_like(query)
use_v1 = max_s <= 8192 and (
max_num_partitions == 1 or num_seqs * num_heads > 512
)
@ -171,6 +172,10 @@ def paged_attention(
try:
is_ampere_or_newer = major >= 8 and minor >= 0
if not is_ampere_or_newer:
raise ImportError("FlashAttention only supports Ampere GPUs or newer.")
import flash_attn_2_cuda
V2 = True
@ -200,13 +205,13 @@ except ImportError:
SUPPORTS_WINDOWING = V2
if V2:
def attention(
q,
k,
v,
out,
cu_seqlens,
max_s,
softmax_scale,
@ -214,6 +219,7 @@ if V2:
causal=True,
softcap=0.0,
):
out = torch.empty_like(q)
if window_size_left <= 0 and window_size_left != -1:
raise ValueError("`window_size_left` must be > 0 or -1")
return flash_attn_2_cuda.varlen_fwd(
@ -238,7 +244,7 @@ if V2:
softcap,
False,
None,
)
)[0]
else:
@ -246,7 +252,6 @@ else:
q,
k,
v,
out,
cu_seqlens,
max_s,
softmax_scale,
@ -286,7 +291,8 @@ else:
.reshape(original_shape[0], -1, original_shape[2])
)
return flash_attn_cuda.fwd(
out = torch.empty_like(q)
flash_attn_cuda.fwd(
q,
k,
v,
@ -303,3 +309,4 @@ else:
0,
None,
)
return out

View File

@ -11,7 +11,6 @@ def attention(
q,
k,
v,
out,
cu_seqlens,
max_s,
softmax_scale,
@ -19,6 +18,8 @@ def attention(
causal=True,
softcap: Optional[float] = None,
):
out = torch.empty_like(q)
# We do not need to check window_size_left here (it is not supported), as it is already checked ahead of time at model load.
return ipex.llm.functional.varlen_attention(
q,
@ -51,7 +52,6 @@ def reshape_and_cache(
def paged_attention(
out: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
@ -62,6 +62,7 @@ def paged_attention(
max_s: int,
softcap: Optional[float] = None,
):
out = torch.empty_like(query)
ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
out,
query,

View File

@ -39,7 +39,6 @@ def reshape_and_cache(
def paged_attention(
out: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
@ -72,6 +71,8 @@ def paged_attention(
max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
input_lengths = input_lengths.input_lengths
out = torch.empty_like(query)
# NOTE(woosuk): We use a simple heuristic to decide whether to use
# PagedAttention V1 or V2. If the number of partitions is 1, we use
# V1 to avoid the overhead of reduction. Also, if the number of
@ -174,7 +175,6 @@ if ENGINE == "ck":
q,
k,
v,
out,
cu_seqlens,
max_s,
softmax_scale,
@ -184,6 +184,8 @@ if ENGINE == "ck":
if window_size_left <= 0 and window_size_left != -1:
raise ValueError("`window_size_left` must be > 0 or -1")
out = torch.empty_like(q)
# We do not need to check window_size_left here (it is not supported), as it is already checked ahead of time at model load.
return flash_attn_2_cuda.varlen_fwd(
q,
@ -209,13 +211,14 @@ elif ENGINE == "triton":
q,
k,
v,
out,
cu_seqlens,
max_s,
softmax_scale,
window_size_left=-1,
causal=True,
):
out = torch.empty_like(q)
# We do not need to check window_size_left here (it is not supported), as it is already checked ahead of time at model load.
output, _ = triton_attention(
q,

View File

@ -124,50 +124,7 @@ class GPTQWeightsLoader(WeightsLoader):
self.sym = sym
def get_weights(self, weights: Weights, prefix: str):
from text_generation_server.layers.marlin import (
can_use_gptq_marlin,
repack_gptq_for_marlin,
)
self._get_gptq_params(weights)
if can_use_gptq_marlin(
bits=self.bits,
groupsize=self.groupsize,
quant_method=self.quant_method,
quantize=self.quantize,
sym=self.sym,
):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
qweight = weights.get_tensor(f"{prefix}.qweight")
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
)
if not self.sym:
qzeros = weights.get_tensor(f"{prefix}.qzeros")
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
g_idx = weights.get_tensor(f"{prefix}.g_idx")
scales = weights.get_tensor(f"{prefix}.scales")
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=False,
)
use_exllama = True
if self.bits != 4:
@ -248,11 +205,6 @@ class GPTQWeightsLoader(WeightsLoader):
prefix: str,
block_sizes: Union[int, List[int]],
):
from text_generation_server.layers.marlin import (
can_use_gptq_marlin,
repack_gptq_for_marlin,
)
try:
qweight = weights.get_packed_sharded(
f"{prefix}.qweight", dim=1, block_sizes=block_sizes
@ -267,36 +219,6 @@ class GPTQWeightsLoader(WeightsLoader):
scales = scales.to(dtype=weights.dtype)
self._get_gptq_params(weights)
if can_use_gptq_marlin(
bits=self.bits,
groupsize=self.groupsize,
quant_method=self.quant_method,
quantize=self.quantize,
sym=self.sym,
):
if not self.sym:
qzeros = weights.get_packed_sharded(
f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
)
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
g_idx = weights.get_tensor(f"{prefix}.g_idx")
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=False,
)
qzeros = weights.get_packed_sharded(
f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
@ -334,11 +256,6 @@ class GPTQWeightsLoader(WeightsLoader):
)
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
from text_generation_server.layers.marlin import (
can_use_gptq_marlin,
repack_gptq_for_marlin,
)
try:
qweight = torch.cat(
[weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
@ -353,41 +270,6 @@ class GPTQWeightsLoader(WeightsLoader):
)
self._get_gptq_params(weights)
if can_use_gptq_marlin(
bits=self.bits,
groupsize=self.groupsize,
quant_method=self.quant_method,
quantize=self.quantize,
sym=self.sym,
):
if not self.sym:
qzeros = torch.cat(
[weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
)
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
for w2 in w[1:]:
torch.testing.assert_close(w2, w[0])
g_idx = w[0]
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=False,
)
qzeros = torch.cat(
[weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
@ -441,59 +323,7 @@ class GPTQWeightsLoader(WeightsLoader):
)
def get_weights_row(self, weights: Weights, prefix: str):
from text_generation_server.layers.marlin import (
can_use_gptq_marlin,
repack_gptq_for_marlin,
)
self._get_gptq_params(weights)
if can_use_gptq_marlin(
bits=self.bits,
groupsize=self.groupsize,
quant_method=self.quant_method,
quantize=self.quantize,
sym=self.sym,
):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
)
if not self.sym:
if self.desc_act or self.groupsize == -1:
qzeros = weights.get_tensor(f"{prefix}.qzeros")
else:
qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
if self.desc_act or self.groupsize == -1:
scales = weights.get_tensor(f"{prefix}.scales")
else:
scales = weights.get_sharded(f"{prefix}.scales", dim=0)
sharded_in_features = weights.process_group.size() > 1
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=sharded_in_features,
)
use_exllama = True
if self.bits != 4:

View File

@ -17,7 +17,7 @@ from loguru import logger
from typing import Optional
from text_generation_server.layers.gptq.utils import torch_snr_error
from text_generation_server.utils.weights import DefaultWeightsLoader
from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight
DEV = torch.device("cuda:0")
@ -897,7 +897,7 @@ def quantize(
dtype=torch.float16,
process_group=process_group,
aliases={"embed_tokens.weight": ["lm_head.weight"]},
weights_loader=DefaultWeightsLoader(),
weights_loader=DefaultWeightsLoader(UnquantizedWeight),
)
hooks = []
for name, module in model.named_modules():
@ -960,9 +960,6 @@ def quantize(
state_dict = model.state_dict()
state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()}
state_dict["gptq_bits"] = torch.LongTensor([bits])
state_dict["gptq_groupsize"] = torch.LongTensor([groupsize])
state_dict["gptq_sym"] = torch.BoolTensor([sym])
max_shard_size = "10GB"
shards, index = shard_checkpoint(
@ -994,6 +991,15 @@ def quantize(
f"index located at {save_index_file}."
)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
config.quantization_config = {
"bits": bits,
"group_size": groupsize,
"damp_percent": percdamp,
"desc_act": act_order,
"static_groups": False,
"sym": sym,
"quant_method": "gptq",
}
config.save_pretrained(output_dir)
logger.info("Saved config")
logger.info("Saving tokenizer")

View File

@ -1,7 +1,6 @@
from text_generation_server.layers.marlin.fp8 import GPTQMarlinFP8Linear
from text_generation_server.layers.marlin.gptq import (
GPTQMarlinLinear,
GPTQMarlinWeight,
GPTQMarlinWeightsLoader,
can_use_gptq_marlin,
repack_gptq_for_marlin,
)
@ -9,8 +8,7 @@ from text_generation_server.layers.marlin.marlin import MarlinWeightsLoader
__all__ = [
"GPTQMarlinFP8Linear",
"GPTQMarlinLinear",
"GPTQMarlinWeight",
"GPTQMarlinWeightsLoader",
"MarlinWeightsLoader",
"can_use_gptq_marlin",
"repack_gptq_for_marlin",

View File

@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Optional
from typing import List, Optional, Union
import numpy
import torch
@ -13,7 +13,7 @@ from text_generation_server.layers.marlin.util import (
)
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weight
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
try:
import marlin_kernels
@ -48,6 +48,204 @@ def can_use_gptq_marlin(
)
class GPTQMarlinWeightsLoader(WeightsLoader):
"""
Loader for using GPTQ- and AWQ-quantized weights with Marlin kernels.
"""
def __init__(
self,
*,
bits: int,
desc_act: bool,
groupsize: int,
quant_method: str,
quantize: str,
sym: bool,
):
self.bits = bits
self.desc_act = desc_act
self.groupsize = groupsize
self.quant_method = quant_method
self.quantize = quantize
self.sym = sym
def get_weights(self, weights: Weights, prefix: str):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
qweight = weights.get_tensor(f"{prefix}.qweight")
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
)
if not self.sym:
qzeros = weights.get_tensor(f"{prefix}.qzeros")
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
g_idx = weights.get_tensor(f"{prefix}.g_idx")
scales = weights.get_tensor(f"{prefix}.scales")
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=False,
)
def get_weights_col_packed(
self,
weights: Weights,
prefix: str,
block_sizes: Union[int, List[int]],
):
try:
qweight = weights.get_packed_sharded(
f"{prefix}.qweight", dim=1, block_sizes=block_sizes
)
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
)
scales = weights.get_packed_sharded(
f"{prefix}.scales", dim=1, block_sizes=block_sizes
)
scales = scales.to(dtype=weights.dtype)
if not self.sym:
qzeros = weights.get_packed_sharded(
f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
)
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
g_idx = weights.get_tensor(f"{prefix}.g_idx")
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=False,
)
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
try:
qweight = torch.cat(
[weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
)
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
)
scales = torch.cat(
[weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
)
if not self.sym:
qzeros = torch.cat(
[weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
)
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
for w2 in w[1:]:
torch.testing.assert_close(w2, w[0])
g_idx = w[0]
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=False,
)
def get_weights_row(self, weights: Weights, prefix: str):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
)
if not self.sym:
if self.desc_act or self.groupsize == -1:
qzeros = weights.get_tensor(f"{prefix}.qzeros")
else:
qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
else:
qzeros = None
if self.quant_method == "awq":
g_idx = None
else:
g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
if self.desc_act or self.groupsize == -1:
scales = weights.get_tensor(f"{prefix}.scales")
else:
scales = weights.get_sharded(f"{prefix}.scales", dim=0)
sharded_in_features = weights.process_group.size() > 1
return repack_gptq_for_marlin(
qweight=qweight,
scales=scales,
qzeros=qzeros,
g_idx=g_idx,
bits=self.bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method=self.quant_method,
sym=self.sym,
sharded_infeatures=sharded_in_features,
)
def _get_gptq_params(self, weights: Weights):
if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"):
self.bits = weights.get_tensor("gptq_bits").item()
self.groupsize = weights.get_tensor("gptq_groupsize").item()
self.desc_act = False
# `server quantize` used asymmetric quantization unconditionally
# before the `gptq_sym` setting tensor was added.
self.sym = (
weights.get_tensor("gptq_sym").item()
if weights._has_tensor("gptq_sym")
else False
)
self.quant_method = "gptq"
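As a usage sketch (not part of the diff), the new loader would be constructed from the model's quantization parameters before any layer is loaded; every argument value below is an illustrative assumption rather than a library default:

from text_generation_server.layers.marlin.gptq import GPTQMarlinWeightsLoader

# Hypothetical construction; the values are placeholders for whatever the
# model's quantization config specifies.
loader = GPTQMarlinWeightsLoader(
    bits=4,
    desc_act=False,
    groupsize=128,
    quant_method="gptq",
    quantize="gptq",
    sym=True,
)
# A Weights instance built with weights_loader=loader would then route
# get_weights / get_weights_row / get_weights_col_packed / get_multi_weights_col
# to the methods defined above, each of which ends in repack_gptq_for_marlin.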
@dataclass
class GPTQMarlinWeight(Weight):
"""

View File

@ -484,6 +484,9 @@ def get_model(
)
sliding_window = config_dict.get("sliding_window", -1)
if max_input_tokens is not None and max_input_tokens <= sliding_window:
sliding_window = -1
if (
(sliding_window is not None and sliding_window != -1)
and not SUPPORTS_WINDOWING
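In short, sliding-window attention is only kept when a request could actually exceed the window. A compact restatement of the new check, using a hypothetical helper name that is not part of the codebase (a window of -1 means "no sliding window"):

from typing import Optional

def resolve_sliding_window(sliding_window: int, max_input_tokens: Optional[int]) -> int:
    # If every prompt fits inside the window, windowing buys nothing.
    if max_input_tokens is not None and max_input_tokens <= sliding_window:
        return -1
    return sliding_window

assert resolve_sliding_window(4096, 2048) == -1    # window never reached
assert resolve_sliding_window(4096, 8192) == 4096  # windowing still needed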

View File

@ -291,17 +291,13 @@ class FlashCohereAttention(torch.nn.Module):
reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
key,
value,
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -309,7 +305,6 @@ class FlashCohereAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],
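This hunk, and the matching hunks in the model files below, all make the same change: attention in the prefill branch (and paged_attention in the decode branch) now allocates and returns the output tensor, so the attn_output = torch.empty_like(query) pre-allocation and the output argument disappear. A toy sketch of the convention change with a stand-in attention function (simplified signature, not the real kernel API):

import torch

# Stand-in kernel used only to show the new calling convention: the output
# is allocated inside the call and returned to the caller.
def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
              softmax_scale: float) -> torch.Tensor:
    scores = torch.softmax((query @ key.transpose(-1, -2)) * softmax_scale, dim=-1)
    return scores @ value

q = k = v = torch.randn(2, 4, 8)
# before: attn_output = torch.empty_like(q); attention(q, k, v, attn_output, ...)
# after: no pre-allocation, the call returns the tensor
attn_output = attention(q, k, v, softmax_scale=8 ** -0.5)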

View File

@ -330,17 +330,13 @@ class DbrxAttention(torch.nn.Module):
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -348,7 +344,6 @@ class DbrxAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -358,25 +358,20 @@ class DeepseekV2Attention(torch.nn.Module):
reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)
# Output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
key,
value,
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
)
# Decode
else:
paged_attention(
attn_output,
attn_output = paged_attention(
query,
kv_cache[0],
kv_cache[1],

View File

@ -231,17 +231,13 @@ class FlashGemma2Attention(torch.nn.Module):
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -252,7 +248,6 @@ class FlashGemma2Attention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -225,17 +225,13 @@ class FlashGemmaAttention(torch.nn.Module):
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -244,7 +240,6 @@ class FlashGemmaAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -225,17 +225,13 @@ class FlashGPT2Attention(torch.nn.Module):
reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
key,
value,
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -243,7 +239,6 @@ class FlashGPT2Attention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -213,17 +213,13 @@ class FlashLlamaAttention(torch.nn.Module):
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -231,7 +227,6 @@ class FlashLlamaAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -212,17 +212,13 @@ class MistralAttention(torch.nn.Module):
kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -231,7 +227,6 @@ class MistralAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -269,17 +269,13 @@ class MixtralAttention(torch.nn.Module):
kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -288,7 +284,6 @@ class MixtralAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -158,17 +158,13 @@ class FlashNeoxAttention(torch.nn.Module):
reshape_and_cache(qkv[:, 1], qkv[:, 2], kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(qkv[:, 0])
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
qkv[:, 0],
qkv[:, 1],
qkv[:, 2],
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -176,7 +172,6 @@ class FlashNeoxAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
qkv[:, 0],
kv_cache[0],
kv_cache[1],

View File

@ -188,16 +188,12 @@ class FlashPhiAttention(torch.nn.Module):
# Reshape key and value and cache
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -205,7 +201,6 @@ class FlashPhiAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -130,17 +130,13 @@ class Qwen2Attention(torch.nn.Module):
kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
)
# output tensor
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -149,7 +145,6 @@ class Qwen2Attention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],

View File

@ -201,17 +201,13 @@ class FlashRWAttention(torch.nn.Module):
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
# output
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=1, index=0),
torch.select(kv, dim=1, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -219,7 +215,6 @@ class FlashRWAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],
@ -324,17 +319,13 @@ class FlashRWLargeAttention(torch.nn.Module):
slots,
)
# output
attn_output = torch.empty_like(query)
# Prefill
if cu_seqlen_prefill is not None:
# flash attention
attention(
attn_output = attention(
query,
torch.select(kv, dim=2, index=0),
torch.select(kv, dim=2, index=1),
attn_output,
cu_seqlen_prefill,
max_s,
self.softmax_scale,
@ -342,7 +333,6 @@ class FlashRWLargeAttention(torch.nn.Module):
# Decode
else:
attn_output = paged_attention(
attn_output,
query,
kv_cache[0],
kv_cache[1],
@ -392,8 +382,13 @@ class FlashRWLayer(nn.Module):
prefix = f"{prefix}.h.{layer_id}"
# NOTE: Falcon 180B uses the ln_attn prefix
ln_prefix = "input_layernorm"
if config.num_hidden_layers == 80:
ln_prefix = "ln_attn"
self.input_layernorm = FastLayerNorm.load(
prefix=f"{prefix}.input_layernorm",
prefix=f"{prefix}.{ln_prefix}",
weights=weights,
eps=config.layer_norm_epsilon,
)
@ -483,7 +478,13 @@ class FlashRWLayer(nn.Module):
class FlashRWLayerNorm(nn.Module):
def __init__(self, config, prefix: str, weights):
super().__init__()
self.num_ln = config.num_ln_in_parallel_attn
# Falcon2 includes the number of layer norms in the config;
# if the config does not provide it, we default to 1
self.num_ln = getattr(config, "num_ln_in_parallel_attn", 1)
# Falcon 180B uses the ln_attn prefix and has 2 layer norms
if config.num_hidden_layers == 80:
self.num_ln = 2
if self.num_ln == 1:
self.input_ln = FastLayerNorm.load(
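Taken together, the Falcon changes in this file select the ln_attn prefix and two parallel layer norms for Falcon 180B (identified here by its 80 hidden layers), and fall back to a single input_layernorm when the config says nothing. A hypothetical helper summarising that selection (not a function in the diff):

from typing import Tuple

def falcon_ln_layout(config) -> Tuple[str, int]:
    ln_prefix = "input_layernorm"
    num_ln = getattr(config, "num_ln_in_parallel_attn", 1)  # Falcon2 sets this
    if config.num_hidden_layers == 80:  # Falcon 180B
        ln_prefix = "ln_attn"
        num_ln = 2
    return ln_prefix, num_ln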

Some files were not shown because too many files have changed in this diff.