Merge branch 'upgrade-outlines' into upgrade-outlines

2025-09-11 20:34:54 +00:00 · 2024-10-28 05:10:45 +01:00 · 2024-10-28 05:10:45 +01:00 · 44a9b2510d
commit 44a9b2510d
parent b04ffd48f1 b49cff3f07
80 changed files with 2719 additions and 1754 deletions
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -202,4 +202,5 @@ jobs:
          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          echo $DOCKER_IMAGE
          docker pull $DOCKER_IMAGE
          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,8 @@ router/tokenizer.json
 backends/v2/src/client/pb
 backends/v3/src/client/pb
 backends/client/src/v2/pb
 backends/client/src/v3/pb
 # ROCm auto-generated files
 *.hip
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -20,7 +20,7 @@ default-members = [
 resolver = "2"
 [workspace.package]
-version = "2.3.2-dev0"
+version = "2.4.1-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
--- a/11
+++ b/11
@ -161,15 +161,6 @@ COPY server/custom_kernels/ .
 # Build specific version of transformers
 RUN python setup.py build
 # Build FBGEMM CUDA kernels
 FROM kernel-builder AS fbgemm-builder
 WORKDIR /usr/src
 COPY server/Makefile-fbgemm Makefile
 RUN make build-fbgemm
 # Build vllm CUDA kernels
 FROM kernel-builder AS vllm-builder
@ -239,8 +230,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from fbgemm builder
 COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
--- a/Dockerfile.trtllm
+++ b/Dockerfile.trtllm
@ -1,23 +0,0 @@
 # All the tooling for CUDA
 FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS cuda-builder
 WORKDIR /usr/src/tgi/backends/trtllm
 RUN apt update && apt install -y cmake git git-lfs gcc g++ ninja-build libopenmpi-dev python3-dev python3-pip wget
 COPY . /usr/src/tgi
 RUN chmod +x scripts/install_tensorrt.sh && scripts/install_tensorrt.sh
 RUN cmake -G Ninja -B build -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include .
 RUN cmake --build build --parallel -t tgi_trtllm_backend_impl
 # All the tooling for Rust
 FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 # Include CUDA related libraries and tools to the Rust based image
 COPY --from=cuda-builder /usr/local/cuda /usr/local/cuda
 COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt
 COPY --from=cuda-builder /usr/src/tgi/backends/trtllm/build /usr/local/tgi/trtllm/build
 ENV PATH=/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH
 RUN apt update && apt install -y cmake git gcc g++ ninja-build libopenmpi3
--- a/backends/trtllm/Dockerfile
+++ b/backends/trtllm/Dockerfile
@ -10,7 +10,7 @@ COPY . .
 RUN cargo chef prepare --recipe-path recipe.json
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
+FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    ninja-build \
    pkg-config \
    python3 \
    python3-dev \
    python3-setuptools \
    tar \
    wget
@ -42,7 +43,7 @@ RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILE
    mkdir /usr/src/mpi && \
    tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
    cd /usr/src/mpi && \
-    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda && \
+    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
    make -j all && \
    make install && \
    rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"
@ -82,10 +83,16 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
    cd backends/trtllm && \
    CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
-FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
 RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
    rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
    python3 -m pip install transformers tokenizers
 WORKDIR /usr/local/tgi/bin
-ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
 ENV TOKENIZERS_PARALLELISM=false
 ENV OMPI_MCA_plm_rsh_agent=""
 COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
--- a/README.md
+++ b/README.md
@ -83,7 +83,7 @@ model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
 ```
 And then you can make requests like
@ -120,7 +120,7 @@ curl localhost:8080/v1/chat/completions \
 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0-rocm --model-id $model` instead of the command above.
 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@ -150,7 +150,7 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
-docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model
 ```
 ### A note on Shared Memory (shm)
--- a/backends/client/src/v3/client.rs
+++ b/backends/client/src/v3/client.rs
@ -107,20 +107,22 @@ impl Client {
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
-        max_input_length: u32,
+        max_input_tokens: Option<u32>,
        max_prefill_tokens: u32,
-        max_total_tokens: u32,
+        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
-    ) -> Result<Option<u32>> {
+    ) -> Result<(Option<u32>, u32, u32)> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
-            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
+            let mut truncate = max_prefill_tokens - n_tokens;
            if let Some(max_input_tokens) = max_input_tokens {
                truncate = min(max_input_tokens, truncate);
            }
            let mut input_chunks = Vec::new();
-            input_chunks
+            input_chunks.push(Chunk::Text("_test ".to_string().repeat(truncate as usize)).into());
                .push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
            if n_tokens == 0 {
                input_chunks.push(
                    Chunk::Image(Image {
@ -136,7 +138,7 @@ impl Client {
            // been updated to support chunks.
            let mut inputs = String::new();
-            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            inputs.push_str(&"_test ".to_string().repeat(truncate as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
@ -145,6 +147,12 @@ impl Client {
                ));
            }
            let max_new_tokens = if let Some(max_total_tokens) = max_total_tokens {
                max_total_tokens - truncate
            } else {
                1
            };
            requests.push(Request {
                id: 0,
                inputs,
@ -175,7 +183,7 @@ impl Client {
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: max_total_tokens - truncate,
+                    max_new_tokens,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
@ -183,7 +191,7 @@ impl Client {
                top_n_tokens: 20,
                adapter_id: None,
            });
-            n_tokens += max_input_length;
+            n_tokens += truncate;
            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
@ -195,19 +203,23 @@ impl Client {
            id: 0,
            size: requests.len() as u32,
            requests,
-            max_tokens: max_input_length,
+            max_tokens: max_input_tokens.unwrap_or(0),
            max_blocks: 0,
        };
        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
-            max_input_length,
+            max_input_tokens,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
-        Ok(response.max_supported_total_tokens)
+        Ok((
            response.max_supported_total_tokens,
            response.max_input_tokens,
            response.max_total_tokens,
        ))
    }
    /// Generate one token for each request in the given batch
--- a/backends/client/src/v3/sharded_client.rs
+++ b/backends/client/src/v3/sharded_client.rs
@ -101,11 +101,11 @@ impl ShardedClient {
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
-        max_input_length: u32,
+        max_input_length: Option<u32>,
        max_prefill_tokens: u32,
-        max_total_tokens: u32,
+        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
-    ) -> Result<Option<u32>> {
+    ) -> Result<(Option<u32>, u32, u32)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
@ -122,8 +122,16 @@ impl ShardedClient {
        let results = join_all(futures)
            .await
            .into_iter()
-            .collect::<Result<Vec<Option<u32>>>>()?;
+            .collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;
-        Ok(results.into_iter().flatten().min())
+
        // Take the minimum value
        // Different shards hold different parts of vocab, might yield
        // different available block size.
        let min = results
            .iter()
            .min()
            .expect("Expect at least 1 warmup result");
        Ok(*min)
    }
    /// Generate one token for each request in the given batch
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@ -1,5 +1,17 @@
 cmake_minimum_required(VERSION 3.20)
 if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
    find_program(CCACHE_EXECUTABLE "ccache")
    if (CCACHE_EXECUTABLE)
        message(STATUS "Using ccache")
        set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
    endif ()
 endif ()
 if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
    cmake_policy(SET CMP0135 NEW)
 endif ()
 project(tgi-trtllm-backend VERSION 1.0.0)
 set(CMAKE_CXX_STANDARD 20)
@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include"
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
 # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
-find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
+find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
 #### External dependencies ####
 include(cmake/fmt.cmake)
--- a/backends/trtllm/Cargo.toml
+++ b/backends/trtllm/Cargo.toml
@ -10,16 +10,17 @@ async-trait = "0.1"
 async-stream = "0.3"
 clap = { version = "4.5", features = ["derive"] }
 cxx = "1.0"
 hashbrown = "0.14"
 hf-hub = { workspace = true }
 log = { version = "0.4", features = [] }
 text-generation-router = { path = "../../router" }
-tokenizers = { version = "0.19", features = ["hf-hub"] }
+tokenizers = { workspace = true }
-tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.15"
-thiserror = "1.0.62"
+thiserror = "1.0.63"
 tracing = "0.1"
-tracing-opentelemetry = "0.24"
+tracing-opentelemetry = "0.25"
 tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
 parking_lot = "0.12"
 [build-dependencies]
 cmake = "0.1"
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf};
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.5";
+const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
    // Build the backend implementation through CMake
    let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
    let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
-    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default
+    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real");
    let mut install_path = PathBuf::from(install_path);
    if !install_path.is_absolute() {
@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
    (PathBuf::from(install_path), deps_folder)
 }
-fn build_ffi_layer(deps_folder: &PathBuf) {
+fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
    let ndebug = match is_debug {
        true => "1",
        false => "0",
    };
    CFG.include_prefix = "backends/trtllm";
    cxx_build::bridge("src/lib.rs")
        .static_flag(true)
@ -93,9 +98,14 @@ fn build_ffi_layer(deps_folder: &PathBuf) {
        .include("/usr/local/tensorrt/include")
        .file("src/ffi.cpp")
        .std("c++20")
        .define("NDEBUG", ndebug)
        .compile("tgi_trtllm_backend");
    println!("cargo:rerun-if-changed=CMakeLists.txt");
    println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
    println!("cargo:rerun-if-changed=cmake/json.cmake");
    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
    println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
    println!("cargo:rerun-if-changed=include/backend.h");
    println!("cargo:rerun-if-changed=lib/backend.cpp");
    println!("cargo:rerun-if-changed=include/ffi.h");
@ -115,7 +125,7 @@ fn main() {
    let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);
    // Build the FFI layer calling the backend above
-    build_ffi_layer(&deps_folder);
+    build_ffi_layer(&deps_folder, is_debug);
    // Emit linkage search path
    probe!("ompi", MPI_REQUIRED_VERSION);
--- a/backends/trtllm/cmake/fmt.cmake
+++ b/backends/trtllm/cmake/fmt.cmake
@ -1,6 +1,6 @@
 FetchContent_Declare(
        fmt
-        GIT_REPOSITORY https://github.com/fmtlib/fmt
+        DOWNLOAD_EXTRACT_TIMESTAMP
-        GIT_TAG 11.0.1
+        URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
 )
 FetchContent_MakeAvailable(fmt)
--- a/backends/trtllm/cmake/json.cmake
+++ b/backends/trtllm/cmake/json.cmake
@ -1,5 +1,6 @@
 fetchcontent_declare(
        json
        DOWNLOAD_EXTRACT_TIMESTAMP
        URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
 )
 fetchcontent_makeavailable(json)
--- a/backends/trtllm/cmake/spdlog.cmake
+++ b/backends/trtllm/cmake/spdlog.cmake
@ -11,7 +11,7 @@ endif ()
 fetchcontent_declare(
        spdlog
-        GIT_REPOSITORY https://github.com/gabime/spdlog.git
+        DOWNLOAD_EXTRACT_TIMESTAMP
-        GIT_TAG v1.14.1
+        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
 )
 fetchcontent_makeavailable(spdlog)
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@ -23,8 +23,9 @@ endif ()
 fetchcontent_declare(
        trtllm
        GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
-        GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1
+        GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
        GIT_SHALLOW FALSE
        DOWNLOAD_EXTRACT_TIMESTAMP
 )
 fetchcontent_makeavailable(trtllm)
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@ -5,6 +5,7 @@
 #ifndef TGI_TRTLLM_BACKEND_H
 #define TGI_TRTLLM_BACKEND_H
 #include <array>
 #include <cmath>
 #include <filesystem>
 #include <span>
@ -19,16 +20,33 @@
 using json = nlohmann::json;
 namespace tle = tensorrt_llm::executor;
 #define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)
 namespace huggingface::tgi::backends {
    using RequestId = tle::IdType;
    using TokenId = tle::TokenIdType;
    const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
    constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
            "Submitting inference [{}] to the executor ({:d} already in-flight)");
    constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
            "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");
    /**
     * Initialize all the components required by TRTLLM.
     * It is required to call this function before attempting to load any engine
     */
    void InitializeBackend();
    /**
     * Initialize logging mechanism
     */
    void InitializeLogging();
    /**
     *
     * @param config TensorRT-LLM configuration object
@ -37,6 +55,14 @@ namespace huggingface::tgi::backends {
     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
    /**
     *
     * @param worldSize
     * @param workerPath
     * @return
     */
    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
    /**
     * Get the sampling configuration from the parameters provided by TGI
     * @param topK
@ -54,7 +80,15 @@ namespace huggingface::tgi::backends {
            float_t repetition_penalty,
            float_t frequency_penalty,
            uint64_t seed
-    );
+    ) noexcept;
    /**
     * Attempt to retrieve the
     * @param generationConfigPath
     * @return
     */
    std::optional<std::list<std::vector<TokenId>>>
    GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;
    /**
     *
@ -64,18 +98,16 @@ namespace huggingface::tgi::backends {
        const json config;
        tle::Executor executor;
        /** Frequently accessed variables cached here **/
        uint32_t maxNumTokens;
        std::list<std::vector<TokenId>> stopWords;
    public:
        explicit TensorRtLlmBackend(
                const std::filesystem::path &engineFolder,
                const std::filesystem::path &executorWorker
        );
        /**
         * Indicate if the backend is ready to accept incoming request
         * @return true if ready, false otherwise
         */
        [[nodiscard]] bool IsReady() const;
        /**
         * Query the executor for the number of token available for pulling
         * @return
@ -88,32 +120,23 @@ namespace huggingface::tgi::backends {
         * @param topK
         * @param topP
         * @param temperature
-         * @param repetition_penalty
+         * @param repetitionPenalty
-         * @param frequency_penalty
+         * @param frequencyPenalty
         * @param seed
         * @return Request id related to this generation for reference
         */
        [[nodiscard]] RequestId Submit(
                const std::vector<TokenId> &tokens,
                uint32_t maxNewTokens,
                int32_t topK,
                float_t topP,
                float_t temperature,
-                float_t repetition_penalty,
+                float_t repetitionPenalty,
-                float_t frequency_penalty,
+                float_t frequencyPenalty,
                uint64_t seed
        );
-        /**
+        [[nodiscard]] std::vector<tle::Response> PullNewTokens();
         *
         * @param requestId The request id to poll the generation results
         * @return
         */
        std::vector<tle::Response> Poll(RequestId requestId);
        /**
         * Stop the underlying executor
         */
        void Shutdown();
    };
 }
--- a/backends/trtllm/include/ffi.h
+++ b/backends/trtllm/include/ffi.h
@ -5,20 +5,31 @@
 #ifndef TGI_TRTLLM_BACKEND_FFI_H
 #define TGI_TRTLLM_BACKEND_FFI_H
 #include <cmath>
 #include <cstddef>
 #include <memory>
 #include "backend.h"
 namespace huggingface::tgi::backends {
    class TensorRtLlmBackendImpl;
 }
 // Template to support returning error from TllmException back to Rust in a Result<>
 #include <tensorrt_llm/common/tllmException.h>
 namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();
    } catch (tensorrt_llm::common::TllmException &e) {
        fail(e.what());
    }
 }
 #include "backends/trtllm/src/lib.rs.h"
 namespace huggingface::tgi::backends {
 //    struct GenerationContext;
    class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
    public:
        /***
@ -28,15 +39,10 @@ namespace huggingface::tgi::backends {
         */
        TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
        /***
         *
         * @return
         */
        bool IsReady() const;
        /***
         *
         * @param tokens
         * @param maxNewTokens
         * @param topK
         * @param topP
         * @param temperature
@ -47,21 +53,15 @@ namespace huggingface::tgi::backends {
         */
        [[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
        uint64_t
-        Submit(rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature,
+        Submit(rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
               int32_t topK, float_t topP, float_t temperature,
               float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
        /***
         *
         * @param requestId
         * @param ctx
         * @param callback
         * @return
         */
-        size_t StreamTokens(
+        std::unique_ptr<std::vector<GenerationStep>> PullTokens();
                const RequestId requestId,
                huggingface::tgi::backends::GenerationContext *ctx,
                rust::Fn<void(huggingface::tgi::backends::GenerationContext *,
                              huggingface::tgi::backends::GenerationStep)> callback);
    };
    /***
--- a/backends/trtllm/include/hardware.h
+++ b/backends/trtllm/include/hardware.h
@ -14,7 +14,7 @@
 namespace huggingface::hardware::cuda {
 #define AMPERE_SM_MAJOR 8
-#define HOPPER_SM_MAJOR 8
+#define HOPPER_SM_MAJOR 9
    /**
     * Store information about the version of the CUDA Compute Capabilities detected on the device
@ -23,9 +23,9 @@ namespace huggingface::hardware::cuda {
        int32_t major;
        int32_t minor;
-        [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
+        [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
-        [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
+        [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
    };
    CudaComputeCapabilities GetCudaComputeCapabilities() {
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@ -1,3 +1,4 @@
 #include <cstdlib>
 #include <fstream>
 #include <fmt/ranges.h>
@ -7,11 +8,33 @@
 #include "backend.h"
 #include "hardware.h"
 void huggingface::tgi::backends::InitializeLogging() {
 #ifdef NDEBUG
    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
            return std::tolower(c);
        });
        if (log_level == "debug")
            spdlog::set_level(spdlog::level::debug);
        else
            spdlog::set_level(spdlog::level::info);
    }
 #else
    spdlog::set_level(spdlog::level::debug);
 #endif
 }
 void huggingface::tgi::backends::InitializeBackend() {
    SPDLOG_INFO("Initializing Backend...");
    nvmlInit_v2();
    initTrtLlmPlugins();
    InitializeLogging();
    SPDLOG_INFO("Backend Executor Version: {}", tle::version());
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    if (numGpus.has_value()) {
        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
@ -20,47 +43,49 @@ void huggingface::tgi::backends::InitializeBackend() {
    }
 }
 [[nodiscard]]
 tle::ParallelConfig
 huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
    auto mode = tle::CommunicationMode::kLEADER;
    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
    if (worldSize > 1) {
        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
        mode = tle::CommunicationMode::kORCHESTRATOR;
        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
    } else {
        SPDLOG_INFO("Detected single engine deployment, using leader mode");
    }
    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
 }
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
-    tle::ExecutorConfig execConfig(1);
+    tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
    // Retrieve the compute capabilities to enable some options at runtime
    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
-    if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
+    const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
-        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kLEADER,
                std::nullopt,
                std::nullopt,
                std::nullopt
        ));
    } else { // Multiple engines -> using orchestrator mode (MPI involved)
        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kORCHESTRATOR,
                std::nullopt,
                std::nullopt,
                tle::OrchestratorConfig(true, workerPath, nullptr, true)
        ));
    }
    // Define some configuration variables
    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
+    execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
    execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
    return execConfig;
 }
 tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
-        uint32_t topK,
+        const uint32_t topK,
-        float_t topP,
+        const float_t topP,
-        float_t temperature,
+        const float_t temperature,
-        float_t repetition_penalty,
+        const float_t repetition_penalty,
-        float_t frequency_penalty,
+        const float_t frequency_penalty,
-        uint64_t seed) {
+        const uint64_t seed) noexcept {
    return tle::SamplingConfig(
            1,  // TGI only use a single beam
            topK,
@ -78,69 +103,101 @@ tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
    );
 }
 std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
 huggingface::tgi::backends::GetStopWordsFromConfig(
        const std::filesystem::path &generationConfigPath) noexcept {
    if (exists(generationConfigPath)) {
        const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
        if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
            SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
            std::list<std::vector<huggingface::tgi::backends::TokenId>> stopWords(eosTokenIds.size());
            const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
                return {tokenIdObj.template get<tle::TokenIdType>()};
            };
            std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
            return stopWords;
        } else {
            SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
        }
    } else {
        SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
    }
    return std::nullopt;
 }
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
 ) :
        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
-        executor(
+        executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
-                enginesFolder,
+                 GetExecutorConfig(config, executorWorker.string())) {
                tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                GetExecutorConfig(config, executorWorker.string()
                )) {
    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());
 }
-bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get<std::string_view>());
-    return executor.canEnqueueRequests();
+
    // Ensure we have enough GPUs on the system
    const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
    if (numGpus < worldSize) {
        SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
        // todo : raise exception to catch on rust side
    }
    // Cache variables
    maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
    // Attempt to discover stopWords from the generation_config.json
    const auto generationConfigPath = enginesFolder / "generation_config.json";
    stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
 }
 [[nodiscard("Returned number of requests needs to be consumed")]]
 size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
 #ifdef NDEBUG
    return executor.getNumResponsesReady();
 #else
    const auto numResponses = executor.getNumResponsesReady();
    if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
    return numResponses;
 #endif
 }
 [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::vector<tle::TokenIdType> &tokens,
        const uint32_t maxNewTokens,
        const int32_t topK,
        const float_t topP,
        const float_t temperature,
-        const float_t repetition_penalty,
+        const float_t repetitionPenalty,
-        const float_t frequency_penalty,
+        const float_t frequencyPenalty,
        const uint64_t seed
 ) {
-#ifdef NDEBUG
+    const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
-    SPDLOG_DEBUG(
+#ifndef NDEBUG
-            FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
+    {
-            tokens.size(),
+        const auto &iterations = executor.getLatestIterationStats();
-            executor.getLatestIterationStats().back().numActiveRequests
+        const auto &lastIteration = iterations.front();
-    );
+
-#else
+        SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
-    SPDLOG_DEBUG(
+        SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
-            FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
+        SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
-            fmt::join(tokens, ", "),
+    }
            executor.getLatestIterationStats().front().numActiveRequests
    );
 #endif
-    const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<size_t>();
+    const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
    const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
-    const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
+    // Build the request
-    const auto output = tle::OutputConfig(true, false, false, true, false);
+    auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
-    return executor.enqueueRequest(
+    request.setStopWords(stopWords);
-            tle::Request{tokens, maxNewTokens, true, sampling, output});
+
    // Submit to the executor for batching
    return executor.enqueueRequest(request);
 }
-[[nodiscard("Generated tokens result must be used")]]
+std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
-std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
+    return executor.awaitResponses();
    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
    return executor.awaitResponses(requestId);
 }
 void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() {
    SPDLOG_INFO("Shutting down executor");
    executor.shutdown();
 }
--- a/backends/trtllm/scripts/install_tensorrt.sh
+++ b/backends/trtllm/scripts/install_tensorrt.sh
@ -2,12 +2,13 @@
 set -ex
-TRT_VER="10.2.0.19"
+TRT_VER_BASE="10.4.0"
-CUDA_VER="12.5"
+TRT_VER_FULL="${TRT_VER_BASE}.26"
-CUDNN_VER="9.2.1.18-1"
+CUDA_VER="12.6"
-NCCL_VER="2.22.3-1+cuda12.5"
+CUDNN_VER="9.5.0.50-1"
-CUBLAS_VER="12.5.3.2-1"
+NCCL_VER="2.22.3-1+cuda12.6"
-NVRTC_VER="12.5.82-1"
+CUBLAS_VER="12.6.3.3-1"
 NVRTC_VER="12.6.77-1"
 for i in "$@"; do
    case $i in
@ -32,8 +33,9 @@ install_ubuntu_requirements() {
    ARCH=$(uname -m)
    if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
    if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
-    curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb
+    curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb
-    dpkg -i cuda-keyring_1.0-1_all.deb
+    dpkg -i cuda-keyring_1.1-1_all.deb
    rm /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list
    apt-get update
    if [[ $(apt list --installed | grep libcudnn9) ]]; then
@ -71,7 +73,7 @@ install_centos_requirements() {
 install_tensorrt() {
    #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
    #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
-    TRT_CUDA_VERSION="12.5"
+    TRT_CUDA_VERSION="12.6"
    if [ -z "$RELEASE_URL_TRT" ];then
        ARCH=${TRT_TARGETARCH}
@ -79,12 +81,12 @@ install_tensorrt() {
        if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
        if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
        if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
-        if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
+        if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
-        RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
+        RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_BASE}/tars/TensorRT-${TRT_VER_FULL}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
    fi
    wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
    tar -xf /tmp/TensorRT.tar -C /usr/local/
-    mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt
+    mv /usr/local/TensorRT-${TRT_VER_FULL} /usr/local/tensorrt
    # pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
    rm -rf /tmp/TensorRT.tar
 }
--- a/backends/trtllm/src/backend.rs
+++ b/backends/trtllm/src/backend.rs
@ -1,330 +0,0 @@
 use std::future::Future;
 use std::path::Path;
 use std::pin::{pin, Pin};
 use std::str::FromStr;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, OnceLock};
 use std::task::{Context, Poll};
 use std::time::Duration;
 use async_trait::async_trait;
 use cxx::UniquePtr;
 use log::{error, warn};
 use tokenizers::Tokenizer;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
 use tokio::time::{sleep, Instant};
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tokio_stream::{Stream, StreamExt};
 use tracing::{instrument, span, Level};
 // use tokio::sync::RwLock;
 use parking_lot::RwLock;
 use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
 use text_generation_router::validation::ValidationError::UnsupportedModality;
 use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError};
 use text_generation_router::{FinishReason, Token};
 use crate::errors::TensorRtLlmBackendError;
 use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
 // Value used to poll the state of the generation stream
 static POLLING_INTERVAL_US: OnceLock<u64> = OnceLock::new();
 type InferResult<T> = Result<T, InferError>;
 pub(crate) struct Generation {
    executor: Arc<RwLock<UniquePtr<TensorRtLlmBackendImpl>>>,
    done: Arc<AtomicBool>,
 }
 /// Holds the user provided input to be executed along with a channel allowing
 /// to bubble up all the generated tokens for that tokens the to end stream.
 pub struct GenerationContext {
    sender: UnboundedSender<InferResult<InferStreamResponse>>,
    tokenizer: Arc<Tokenizer>,
    tokens: Vec<u32>,
    done: Arc<AtomicBool>,
    queued: Instant,
    start: Option<Instant>,
 }
 impl Stream for Generation {
    type Item = usize;
    fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let interval = POLLING_INTERVAL_US.get_or_init(|| {
            u64::from_str(option_env!("TRTLLM_BACKEND_POLLING_INTERVAL_US").unwrap_or("100"))
                .expect("Invalid value provided for envvar POLLING_INTERVAL_US")
        });
        if !self.done.load(Ordering::Relaxed) {
            let backend = pin!(self.executor.read());
            let status = match backend.poll(ctx) {
                Poll::Ready(executor_r) => {
                    let ready = executor_r.num_responses_ready();
                    if ready == 0 {
                        Poll::Pending
                    } else {
                        Poll::Ready(Some(ready))
                    }
                }
                Poll::Pending => Poll::Pending,
            };
            let waker = ctx.waker().clone();
            tokio::spawn(async {
                sleep(Duration::from_micros(*interval)).await;
                waker.wake();
            });
            status
        } else {
            Poll::Ready(None) // end of stream
        }
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        (1, None)
    }
 }
 unsafe impl Send for TensorRtLlmBackendImpl {}
 unsafe impl Sync for TensorRtLlmBackendImpl {}
 /// Implements the logic to execute generation with TensorRT-LLM executor API in background
 pub struct TensorRtLlmBackend {
    tokenizer: Arc<Tokenizer>,
    // Backing the backend behind a RwLock to allow concurrent read access to retrieve
    // the number of available tokens (read only) in the Generation stream
    backend: Arc<RwLock<UniquePtr<TensorRtLlmBackendImpl>>>,
 }
 impl TensorRtLlmBackend {
    pub fn new<P: AsRef<Path> + Send + 'static, PP: AsRef<Path> + Send + 'static>(
        tokenizer: Tokenizer,
        engine_folder: P,
        executor_worker_path: PP,
    ) -> Result<Self, TensorRtLlmBackendError> {
        Ok(TensorRtLlmBackend {
            tokenizer: Arc::new(tokenizer),
            backend: Arc::new(RwLock::new(create_tensorrt_llm_backend(
                engine_folder.as_ref().to_str().unwrap(),
                executor_worker_path.as_ref().to_str().unwrap(),
            ))),
        })
    }
    fn validate(request: &ValidGenerateRequest) -> InferResult<&String> {
        if request.top_n_tokens > 1 {
            return Err(InferError::ValidationError(
                ValidationError::TopNTokensDisabled,
            ));
        }
        // TODO: Is it really needed? How can it be validated before?
        if request.parameters.grammar.is_some() {
            return Err(InferError::ValidationError(ValidationError::Grammar));
        }
        match request.inputs.len() {
            0 => Err(InferError::ValidationError(ValidationError::EmptyInput)),
            2.. => Err(InferError::GenerationError(
                "TensorRT-LLM backend don't support multi-chunk".into(),
            )),
            1 => match request.inputs.first().expect("Single item-chunk") {
                Chunk::Text(text) => Ok(text),
                Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))),
            },
        }
    }
    fn generate(
        &self,
        sender: UnboundedSender<InferResult<InferStreamResponse>>,
        tokens: Vec<u32>,
        top_k: u32,
        top_p: f32,
        temperature: f32,
        repetition_penalty: f32,
        frequency_penalty: f32,
        seed: u64,
    ) {
        let tokenizer = Arc::clone(&self.tokenizer);
        let executor = Arc::clone(&self.backend);
        // Let's push this in async context
        tokio::spawn(async move {
            // Define the generation state
            let mut generation = Generation {
                executor: executor.clone(),
                done: Arc::new(AtomicBool::new(false)),
            };
            // Define the context over the generation
            // TODO(asap): Do we really need so many shared-ownership?
            let ctx = Box::new(GenerationContext {
                sender: sender.clone(),
                tokenizer,
                tokens: vec![],
                done: Arc::clone(&generation.done),
                start: None,
                queued: Instant::now(),
            });
            // We are leaking the context on-purpose to avoid the box being dropped while there are
            // still computation ongoing
            // TODO(asap): Can we achieve the same with an Arc<Box<T>> without the need to go unsafe?
            let ctx_ = Box::leak(ctx);
            // Submit the request to the batcher
            let request_id = span!(Level::DEBUG, "submit")
                .in_scope(|| async {
                    let mut handle = executor.write().await;
                    let request_id = handle.pin_mut().submit(
                        &tokens,
                        top_k as i32,
                        top_p,
                        temperature,
                        repetition_penalty,
                        frequency_penalty,
                        seed,
                    );
                    request_id
                })
                .await;
            while let Some(_) = generation.next().await {
                let mut executor_w = executor.write().await;
                let executor = executor_w.pin_mut();
                span!(Level::DEBUG, "decode")
                    .in_scope(|| async {
                        unsafe {
                            executor.stream_tokens(
                                request_id,
                                ctx_,
                                |ctx: *mut GenerationContext, step: GenerationStep| {
                                    let inner_ctx = &mut *ctx;
                                    // Update the timestamp at which the request started effectively
                                    // Can be a bit off, would need to be before the callback, let's see
                                    inner_ctx.start.get_or_insert(Instant::now());
                                    inner_ctx.done.store(step.is_final, Ordering::Relaxed);
                                    // Ensure we are not running into errors
                                    let parcel = if !step.has_error {
                                        // Insert the latest generated token to the tracker
                                        inner_ctx.tokens.push(step.token_id);
                                        // Decode the token
                                        let text = inner_ctx
                                            .tokenizer
                                            .decode(&[step.token_id], true)
                                            .expect("Failed to decode token");
                                        let special = inner_ctx
                                            .tokenizer
                                            .get_added_vocabulary()
                                            .is_special_token(&text);
                                        // Create the structure holding the token
                                        let token = Token {
                                            id: step.token_id,
                                            text,
                                            logprob: step.log_prob,
                                            special,
                                        };
                                        if step.is_final {
                                            let generated_text = inner_ctx
                                                .tokenizer
                                                .decode(&inner_ctx.tokens, true)
                                                .expect("Failed to decode generated_tokens");
                                            Ok(InferStreamResponse::End {
                                                token,
                                                top_tokens: vec![],
                                                generated_text: GeneratedText {
                                                    text: generated_text,
                                                    generated_tokens: inner_ctx.tokens.len() as u32,
                                                    finish_reason: FinishReason::EndOfSequenceToken,
                                                    seed: None,
                                                },
                                                start: inner_ctx.start.unwrap_or(Instant::now()),
                                                queued: inner_ctx.queued,
                                            })
                                        } else {
                                            Ok(InferStreamResponse::Intermediate {
                                                token,
                                                top_tokens: vec![],
                                            })
                                        }
                                    } else {
                                        error!("Error caught while decoding: {}", &step.error_msg);
                                        Err(InferError::GenerationError(step.error_msg))
                                    };
                                    // Send the parcel to the client
                                    inner_ctx
                                        .sender
                                        .send(parcel)
                                        .expect("Failed to sent msg through the channel");
                                },
                            );
                        }
                    })
                    .await;
            }
            // "Properly" free the shared context...
            // TODO: clean that piece of sh** asap
            unsafe {
                let _ = Box::from_raw(ctx_);
            }
        });
    }
 }
 #[async_trait]
 impl Backend for TensorRtLlmBackend {
    #[instrument(skip_all)]
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> InferResult<UnboundedReceiverStream<InferResult<InferStreamResponse>>> {
        // Let's add a few more validation
        let input = TensorRtLlmBackend::validate(&request)?;
        // Channel to stream the generated token as they come from the worker thread back to the transport layer
        let (sender, receiver) = unbounded_channel();
        // Unpack parameters
        let params = &request.parameters;
        // Preprocess the inputs to send to TRTLLM backend
        let encoding = self
            .tokenizer
            .encode(input.as_str(), true)
            .map_err(|e| InferError::GenerationError(e.to_string()))?;
        // Generate the response
        self.generate(
            sender,
            Vec::from(encoding.get_ids()),
            params.top_k,
            params.top_p,
            params.temperature,
            params.repetition_penalty,
            params.frequency_penalty,
            params.seed,
        );
        Ok(UnboundedReceiverStream::new(receiver))
    }
    async fn health(&self, _current_health: bool) -> bool {
        true
    }
 }
--- a/backends/trtllm/src/errors.rs
+++ b/backends/trtllm/src/errors.rs
@ -1,9 +1,16 @@
 use std::path::PathBuf;
 use thiserror::Error;
 use text_generation_router::server;
 #[derive(Debug, Error)]
 pub enum TensorRtLlmBackendError {
    #[error("Provided engine folder {0} doesn't exist")]
    EngineFolderDoesntExists(PathBuf),
    #[error("Provided executorWorker binary path {0} doesn't exist")]
    ExecutorWorkerNotFound(PathBuf),
    #[error("TensorRT-LLM Runtime error: {0}")]
    Runtime(String),
    #[error("Tokenizer error: {0}")]
    Tokenizer(String),
    #[error("Argument validation error: {0}")]
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@ -3,11 +3,13 @@
 //
 #pragma once
-#include <cmath>
+#include <algorithm>
 #include <exception>
 #include <filesystem>
 #include <functional>
 #include <limits>
 #include <iterator>
 #include <ranges>
 #include <vector>
 #include <spdlog/spdlog.h>
@ -20,61 +22,64 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
 ) : TensorRtLlmBackend(engineFolder, executorWorker) {}
 bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const {
    return TensorRtLlmBackend::IsReady();
 }
 uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
-        rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty,
+        rust::Slice<const uint32_t> tokens,
-        float_t frequency_penalty, uint64_t seed) {
+        uint32_t maxNewTokens,
        int32_t topK,
        float_t topP,
        float_t temperature,
        float_t repetition_penalty,
        float_t frequency_penalty,
        uint64_t seed) {
    // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
+    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
    return TensorRtLlmBackend::Submit(
-            std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
+            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
 }
-size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
+std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
-        const uint64_t requestId,
+huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
-        huggingface::tgi::backends::GenerationContext *ctx,
+    const auto responses = TensorRtLlmBackend::PullNewTokens();
        rust::Fn<void(huggingface::tgi::backends::GenerationContext *,
                      huggingface::tgi::backends::GenerationStep)> callback) {
-    size_t numTokens = 0;
+    auto steps = std::make_unique<std::vector<GenerationStep>>();
-    for (const auto &item: Poll(requestId)) {
+    steps->reserve(responses.size());
        GenerationStep step;
        if (!item.hasError()) {
            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
            const auto decoded = item.getResult();
-            const auto token = decoded.outputTokenIds[0][0];
+#ifndef NDEBUG
-            const auto isFinal = decoded.isFinal;
+    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
-            const auto logProb = decoded.logProbs.value()[0][0];
+#endif
-            ++numTokens;
+    // Transform tle::Response to GenerationStep
-
+    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
-            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
+        const auto reqId = r.getRequestId();
-            step = huggingface::tgi::backends::GenerationStep{
+        if (!r.hasError()) {
-                    static_cast<uint32_t>(token), logProb, isFinal, false, std::move(std::string())
+            const auto result = r.getResult();
            return GenerationStep{
                    reqId,
                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
                    result.logProbs.value()[0][0],
                    result.isFinal,
                    false,
                    std::string()
            };
            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
        } else {
-            // TODO : Return rest::Result with error
+            return GenerationStep{
-            const auto what = item.getErrorMsg();
+                    reqId,
-            SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", what);
+                    0,
-            step = huggingface::tgi::backends::GenerationStep{
+                    0.0,
-                    std::numeric_limits<uint32_t>::max(), 0.0, true, true, std::move(what)
+                    true,
                    true,
                    std::move(r.getErrorMsg())
            };
        }
    });
-        callback(std::move(ctx), std::move(step));
+    return steps;
    }
    return numTokens;
 }
 std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
 huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
    SPDLOG_INFO("Creating TensorRT-LLM Backend");
    // Unconditionally call this to initialize and discover TRTLLM plugins
    InitializeBackend();
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@ -1,14 +1,16 @@
-pub use backend::{GenerationContext, TensorRtLlmBackend};
+pub use looper::TensorRtLlmBackendV2;
 mod backend;
 pub mod errors;
 mod looper;
 mod utils;
 #[cxx::bridge(namespace = "huggingface::tgi::backends")]
 mod ffi {
    /// Struct used as shared type between rust and C++ to represent the result
    /// of a single decoding iteration
    #[derive(Debug, Clone)]
    pub struct GenerationStep {
        request_id: u64,
        token_id: u32,
        log_prob: f32,
        is_final: bool,
@ -16,10 +18,6 @@ mod ffi {
        error_msg: String,
    }
    extern "Rust" {
        type GenerationContext;
    }
    unsafe extern "C++" {
        include!("backends/trtllm/src/ffi.cpp");
@ -44,10 +42,7 @@ mod ffi {
        fn CreateTensorRtLlmBackend(
            engine_folder: &str,
            executor_worker: &str,
-        ) -> UniquePtr<TensorRtLlmBackendImpl>;
+        ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
        // #[rust_name = "is_ready"]
        // fn IsReady(self: &TensorRtLlmBackendImpl) -> bool;
        #[rust_name = "num_responses_ready"]
        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
@ -56,23 +51,18 @@ mod ffi {
        fn Submit(
            self: Pin<&mut TensorRtLlmBackendImpl>,
            tokens: &[u32],
            max_new_tokens: u32,
            top_k: i32,
            top_p: f32,
            temperature: f32,
            repetition_penalty: f32,
            frequency_penalty: f32,
            seed: u64,
-        ) -> u64;
+        ) -> Result<u64>;
-        #[rust_name = "stream_tokens"]
+        #[rust_name = "pull_tokens"]
-        unsafe fn StreamTokens(
+        fn PullTokens(
            self: Pin<&mut TensorRtLlmBackendImpl>,
-            request_id: u64,
+        ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
            ctx: *mut GenerationContext,
            cb: unsafe fn(*mut GenerationContext, GenerationStep),
        ) -> usize;
        // #[rust_name = "shutdown"]
        // fn Shutdown(self: Pin<&mut TensorRtLlmBackendImpl>);
    }
 }
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@ -0,0 +1,382 @@
 use std::hint;
 use std::ops::Deref;
 use std::path::Path;
 use async_trait::async_trait;
 use cxx::UniquePtr;
 use hashbrown::HashMap;
 use tokenizers::Tokenizer;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
 use tokio::sync::TryAcquireError;
 use tokio::task::{spawn_blocking, JoinHandle};
 use tokio::time::Instant;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, error, warn};
 use text_generation_router::infer::InferError::{GenerationError, ValidationError};
 use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
 use text_generation_router::validation::ValidationError::{
    EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
 };
 use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};
 use crate::errors::TensorRtLlmBackendError;
 use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;
 type InferResult<T> = Result<T, InferError>;
 /// Wrap the requests along with the channel used to stream back to the client the decoded tokens
 struct GenerationContext {
    request: ValidGenerateRequest,
    start: Option<Instant>,
    queued: Instant,
    streamer: UnboundedSender<InferResult<InferStreamResponse>>,
 }
 #[derive(Debug, Copy, Clone)]
 struct DecodedToken {
    id: u32,
    log_prob: f32,
    is_final: bool,
 }
 impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
    type Error = InferError;
    fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
        if !step.has_error {
            Ok(Self {
                id: step.token_id,
                log_prob: step.log_prob,
                is_final: step.is_final,
            })
        } else {
            Err(GenerationError(step.error_msg.clone()))
        }
    }
 }
 /// Wraps the decoded token with the channel used to stream back to the client the decoded tokens
 struct DecodedTokenContext {
    token: DecodedToken,
    start: Option<Instant>,
    queued: Instant,
    channel: UnboundedSender<InferResult<InferStreamResponse>>,
 }
 fn executor_status_looper(
    mut backend: UniquePtr<TensorRtLlmBackendImpl>,
    max_inflight_requests: usize,
    mut waiting_requests: UnboundedReceiver<GenerationContext>,
    post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
 ) {
    // Track the tuple (request_id, stream) for each request
    let mut in_flights =
        HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);
    // TODO: Does it need a spin-loop?
    'scheduler: loop {
        // Is there any request pending to be scheduled?
        let awaiting_requests = waiting_requests.len();
        for _ in 0..awaiting_requests {
            // Retrieve all the requests
            if let Some(mut ctx) = waiting_requests.blocking_recv() {
                // Submit all the request to the executor and move the context to the in-flight tracker
                let request = &ctx.request;
                let generation_params = &request.parameters;
                let stopping_params = &request.stopping_parameters;
                let input_ids = request.input_ids.as_deref();
                // Submit to the TensorRT-LLM executor for scheduling
                match backend.pin_mut().submit(
                    &input_ids.unwrap(), // This is checked beforehand in validate()
                    stopping_params.max_new_tokens,
                    generation_params.top_k as i32,
                    generation_params.top_p,
                    generation_params.temperature,
                    generation_params.repetition_penalty,
                    generation_params.frequency_penalty,
                    generation_params.seed,
                ) {
                    Ok(request_id) => {
                        // Insert the context linked to the generated request id in the tracker
                        debug!("[in-flight] Added {}", request_id);
                        ctx.start = Some(Instant::now());
                        in_flights.insert(request_id, ctx);
                    }
                    Err(e) => {
                        // Return to the caller
                        let what = e.to_string();
                        error!(error = what.as_str(), "Failed to schedule request");
                        let err = Err(InferError::Overloaded(TryAcquireError::NoPermits));
                        if let Err(_) = ctx.streamer.send(err) {
                            error!("Failed to send back error to the client");
                        }
                    }
                };
            }
        }
        if backend.num_responses_ready() > 0 {
            match backend.pin_mut().pull_tokens() {
                Ok(responses) => {
                    // Iterate through all the decoded token
                    for step in responses.deref() {
                        if let Some(ctx) = in_flights.get(&step.request_id) {
                            // Remove from tracked requests
                            let parcel =
                                DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
                                    token: dt,
                                    start: ctx.start,
                                    queued: ctx.queued,
                                    channel: ctx.streamer.clone(),
                                });
                            // Submit the work to p:the post_processor
                            let posted = post_processor_sender.send((step.request_id, parcel));
                            if posted.is_err() || step.is_final {
                                debug!("Removing {}", step.request_id);
                                let _ = in_flights.remove(&step.request_id);
                            }
                        } else {
                            warn!("Untracked request {}", step.request_id,);
                        }
                    }
                }
                Err(ref err) => {
                    error!("Failed to get responses from the executor: {}.", err.what());
                    break 'scheduler;
                }
            }
        }
        // Hint the CPU we are spin-locking
        hint::spin_loop();
    }
 }
 fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
    tokenizer: Tokenizer,
    max_inflight_requests: usize,
    mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
 ) {
    let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
    'post_processor: loop {
        if decoded_tokens.is_closed() {
            warn!("Post processor IPC is closed, loop will exit now.");
            break 'post_processor;
        }
        if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
            match decoded {
                Ok(ctx) => {
                    states
                        .entry(request_id)
                        .and_modify(|s| s.push(*&ctx.token.id))
                        .or_insert_with(|| {
                            let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
                            state.push(*&ctx.token.id);
                            state
                        });
                    let out = match tokenizer.decode(&[ctx.token.id], false) {
                        Ok(text) => {
                            let is_special =
                                tokenizer.get_added_vocabulary().is_special_token(&text);
                            let token = Token {
                                id: ctx.token.id,
                                text,
                                logprob: ctx.token.log_prob,
                                special: is_special,
                            };
                            let out = if !ctx.token.is_final {
                                InferStreamResponse::Intermediate {
                                    token,
                                    top_tokens: vec![],
                                }
                            } else {
                                let tokens = states.remove(&request_id).unwrap();
                                let text = tokenizer.decode(&tokens, true);
                                let generated_text = GeneratedText {
                                    text: text.unwrap(),
                                    generated_tokens: tokens.len() as u32,
                                    finish_reason: FinishReason::EndOfSequenceToken,
                                    seed: None,
                                };
                                InferStreamResponse::End {
                                    token,
                                    top_tokens: vec![],
                                    generated_text,
                                    start: ctx.start.unwrap(),
                                    queued: ctx.queued,
                                }
                            };
                            Ok(out)
                        }
                        Err(err) => Err(GenerationError(err.to_string())),
                    };
                    if let Err(_) = ctx.channel.send(out) {
                        warn!("Failed to send decoded token back to the user")
                    }
                }
                Err(_err) => {
                    todo!("what do we do?")
                }
            }
        }
    }
 }
 fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
    engine_folder: P,
    executor_worker_path: PP,
 ) -> Result<(String, String), TensorRtLlmBackendError> {
    // Retrieve paths as &str for the backend creation
    let engine_folder = engine_folder.as_ref();
    let executor_worker_path = executor_worker_path.as_ref();
    // Ensure the engine folder exists
    if !engine_folder.exists() {
        let err = TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf());
        error!("Path validation failed: {}", err,);
        return Err(err);
    }
    // Ensure executor worker binary exists
    if !executor_worker_path.exists() {
        let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(engine_folder.to_path_buf());
        error!("Path validation failed: {}", err,);
        return Err(err);
    }
    let engine_folder = String::from(
        engine_folder
            .to_str()
            .expect("Failed to convert engine_folder to valid UTF-8"),
    );
    let executor_worker_path = String::from(
        executor_worker_path
            .to_str()
            .expect("Failed to convert executor_worker_path to valid UTF-8"),
    );
    Ok((engine_folder, executor_worker_path))
 }
 unsafe impl Send for TensorRtLlmBackendImpl {}
 pub struct TensorRtLlmBackendV2 {
    executor_looper: JoinHandle<()>,
    post_processor_looper: JoinHandle<()>,
    executor: UnboundedSender<GenerationContext>,
 }
 impl TensorRtLlmBackendV2 {
    pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
        tokenizer: Tokenizer,
        engine_folder: P,
        executor_worker_path: PP,
        max_inflight_requests: usize,
    ) -> Result<Self, TensorRtLlmBackendError> {
        let (engine_folder, executor_worker_path) =
            ensure_paths_exist(engine_folder, executor_worker_path)?;
        // Allocate the IPC layer to communicate with the backend
        let (executor_sender, executor_receiver) = unbounded_channel();
        let (post_processor_sender, post_processor_receiver) = unbounded_channel();
        // Create the FFI backend
        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
        // Executor looper is responsible for scheduling and pulling requests state at regular interval
        let executor_looper = spawn_blocking(move || {
            executor_status_looper(
                backend,
                max_inflight_requests,
                executor_receiver,
                post_processor_sender,
            )
        });
        // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user
        let post_processor_looper = spawn_blocking(move || {
            post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
        });
        Ok(TensorRtLlmBackendV2 {
            executor_looper,
            post_processor_looper,
            executor: executor_sender,
        })
    }
    fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
        if request.input_ids.is_none() {
            return Err(ValidationError(UnsupportedModality("No token provided")));
        }
        if request.top_n_tokens > 1 {
            return Err(ValidationError(TopNTokensDisabled));
        }
        // TODO: Is it really needed? How can it be validated before?
        if request.parameters.grammar.is_some() {
            return Err(ValidationError(Grammar));
        }
        match request.inputs.len() {
            0 => Err(ValidationError(EmptyInput)),
            2.. => Err(GenerationError(
                "TensorRT-LLM backend don't support multi-chunk".into(),
            )),
            1 => match request.inputs.first().expect("Single item-chunk") {
                Chunk::Text(_) => Ok(()),
                Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))),
            },
        }
    }
 }
 #[async_trait]
 impl Backend for TensorRtLlmBackendV2 {
    fn schedule(
        &self,
        inner: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        Self::validate(&inner)?;
        // Open-up the stream to send tokens
        let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
        // Send the context to the executor for scheduling
        let queued = Instant::now();
        match self.executor.send(GenerationContext {
            request: inner,
            start: None,
            queued,
            streamer,
        }) {
            Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
            Err(_) => Err(GenerationError(
                "Failed to submit request to the backend".into(),
            )),
        }
    }
    async fn health(&self, _: bool) -> bool {
        !self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
    }
 }
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@ -1,10 +1,16 @@
 use std::path::{Path, PathBuf};
 use clap::Parser;
-use std::collections::HashMap;
+use hf_hub::api::tokio::{Api, ApiBuilder};
-use std::path::PathBuf;
+use hf_hub::{Cache, Repo, RepoType};
 use tokenizers::Tokenizer;
 use tracing::info;
 use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
-use text_generation_backends_trtllm::TensorRtLlmBackend;
+use text_generation_backends_trtllm::TensorRtLlmBackendV2;
-use text_generation_router::{server, usage_stats};
+use text_generation_router::server::get_base_tokenizer;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use text_generation_router::usage_stats::UsageStatsLevel;
 use text_generation_router::{server, HubTokenizerConfig};
 /// App Configuration
 #[derive(Parser, Debug)]
@ -58,6 +64,130 @@ struct Args {
    usage_stats: usage_stats::UsageStatsLevel,
 }
 async fn get_tokenizer(
    tokenizer_name: &str,
    tokenizer_config_path: Option<&str>,
    revision: Option<&str>,
 ) -> Option<Tokenizer> {
    // Parse Huggingface hub token
    let authorization_token = std::env::var("HF_TOKEN")
        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok();
    // Tokenizer instance
    let local_path = Path::new(tokenizer_name);
    // Shared API builder initialization
    let api_builder = || {
        let mut builder = ApiBuilder::new()
            .with_progress(false)
            .with_token(authorization_token);
        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
            builder = builder.with_cache_dir(cache_dir.into());
        }
        builder
    };
    // Decide if we need to use the API based on the revision and local path
    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
    // Initialize API if needed
    #[derive(Clone)]
    enum Type {
        Api(Api),
        Cache(Cache),
        None,
    }
    let api = if use_api {
        if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) {
            let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
                .map_err(|_| ())
                .map(|cache_dir| Cache::new(cache_dir.into()))
                .unwrap_or_else(|_| Cache::default());
            tracing::warn!("Offline mode active using cache defaults");
            Type::Cache(cache)
        } else {
            tracing::info!("Using the Hugging Face API");
            match api_builder().build() {
                Ok(api) => Type::Api(api),
                Err(_) => {
                    tracing::warn!("Unable to build the Hugging Face API");
                    Type::None
                }
            }
        }
    } else {
        Type::None
    };
    // Load tokenizer and model info
    let (
        tokenizer_filename,
        _config_filename,
        tokenizer_config_filename,
        _preprocessor_config_filename,
        _processor_config_filename,
    ) = match api {
        Type::None => (
            Some(local_path.join("tokenizer.json")),
            Some(local_path.join("config.json")),
            Some(local_path.join("tokenizer_config.json")),
            Some(local_path.join("preprocessor_config.json")),
            Some(local_path.join("processor_config.json")),
        ),
        Type::Api(api) => {
            let api_repo = api.repo(Repo::with_revision(
                tokenizer_name.to_string(),
                RepoType::Model,
                revision.unwrap_or_else(|| "main").to_string(),
            ));
            let tokenizer_filename = match api_repo.get("tokenizer.json").await {
                Ok(tokenizer_filename) => Some(tokenizer_filename),
                Err(_) => get_base_tokenizer(&api, &api_repo).await,
            };
            let config_filename = api_repo.get("config.json").await.ok();
            let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
            let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
            let processor_config_filename = api_repo.get("processor_config.json").await.ok();
            (
                tokenizer_filename,
                config_filename,
                tokenizer_config_filename,
                preprocessor_config_filename,
                processor_config_filename,
            )
        }
        Type::Cache(cache) => {
            let repo = cache.repo(Repo::with_revision(
                tokenizer_name.to_string(),
                RepoType::Model,
                revision.clone().unwrap_or_else(|| "main").to_string(),
            ));
            (
                repo.get("tokenizer.json"),
                repo.get("config.json"),
                repo.get("tokenizer_config.json"),
                repo.get("preprocessor_config.json"),
                repo.get("processor_config.json"),
            )
        }
    };
    // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
    let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
    {
        HubTokenizerConfig::from_file(filename)
    } else {
        tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
    };
    tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok())
 }
 #[tokio::main]
 async fn main() -> Result<(), TensorRtLlmBackendError> {
    // Get args
@ -124,18 +254,26 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
        )));
    }
-    // Run server
+    // Create the backend
-    let tokenizer = Tokenizer::from_pretrained(
+    let tokenizer = get_tokenizer(
-        tokenizer_name.clone(),
+        &tokenizer_name,
-        Some(FromPretrainedParameters {
+        tokenizer_config_path.as_deref(),
-            revision: revision.clone().unwrap_or(String::from("main")),
+        revision.as_deref(),
            user_agent: HashMap::new(),
            auth_token,
        }),
    )
-    .map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?;
+    .await
    .expect("Failed to retrieve tokenizer implementation");
-    let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?;
+    info!("Successfully retrieved tokenizer {}", &tokenizer_name);
    let backend = TensorRtLlmBackendV2::new(
        tokenizer,
        model_id,
        executor_worker,
        max_concurrent_requests,
    )?;
    info!("Successfully created backend");
    // Run server
    server::run(
        backend,
        max_concurrent_requests,
@ -145,7 +283,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
        max_input_tokens,
        max_total_tokens,
        validation_workers,
-        None,
+        auth_token,
        tokenizer_name,
        tokenizer_config_path,
        revision,
--- a/backends/trtllm/src/utils.rs
+++ b/backends/trtllm/src/utils.rs
@ -0,0 +1,22 @@
 ///
 /// Extract the first line of the provided string reference.
 /// If there is no lines in the buffer, it returns a string
 /// which content is defined by the content of `fail`
 /// # Arguments
 ///
 /// * `s`: The string buffer to extract the first-line from
 /// * `fail`: A string content which is returned if no lines are
 /// present in `s`
 ///
 /// returns: String
 ///
 /// # Examples
 ///
 /// ```
 /// let s = "My name is Morgan.\n I'm working at Hugging Face.";
 /// first_line(s, "No line in string");
 /// ```
 #[inline]
 pub(crate) fn first_line(s: &str, fail: &str) -> String {
    s.lines().next().unwrap_or(fail).to_string()
 }
--- a/backends/v3/src/client/grpc_client.rs
+++ b/backends/v3/src/client/grpc_client.rs
@ -108,20 +108,22 @@ impl Client {
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
-        max_input_length: u32,
+        max_input_tokens: Option<u32>,
        max_prefill_tokens: u32,
-        max_total_tokens: u32,
+        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
-    ) -> Result<Option<u32>> {
+    ) -> Result<(Option<u32>, u32, u32)> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
-            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
+            let mut truncate = max_prefill_tokens - n_tokens;
            if let Some(max_input_tokens) = max_input_tokens {
                truncate = min(max_input_tokens, truncate);
            }
            let mut input_chunks = Vec::new();
-            input_chunks
+            input_chunks.push(Chunk::Text("_test ".to_string().repeat(truncate as usize)).into());
                .push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
            if n_tokens == 0 {
                input_chunks.push(
                    Chunk::Image(Image {
@ -137,7 +139,7 @@ impl Client {
            // been updated to support chunks.
            let mut inputs = String::new();
-            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            inputs.push_str(&"_test ".to_string().repeat(truncate as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
@ -146,6 +148,12 @@ impl Client {
                ));
            }
            let max_new_tokens = if let Some(max_total_tokens) = max_total_tokens {
                max_total_tokens - truncate
            } else {
                1
            };
            requests.push(Request {
                id: 0,
                inputs,
@ -175,7 +183,7 @@ impl Client {
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: max_total_tokens - truncate,
+                    max_new_tokens,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
@ -183,7 +191,7 @@ impl Client {
                top_n_tokens: 20,
                adapter_id: None,
            });
-            n_tokens += max_input_length;
+            n_tokens += truncate;
            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
@ -195,19 +203,23 @@ impl Client {
            id: 0,
            size: requests.len() as u32,
            requests,
-            max_tokens: max_input_length,
+            max_tokens: max_input_tokens.unwrap_or(0),
            max_blocks: 0,
        };
        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
-            max_input_length,
+            max_input_tokens,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
-        Ok(response.max_supported_total_tokens)
+        Ok((
            response.max_supported_total_tokens,
            response.max_input_tokens,
            response.max_total_tokens,
        ))
    }
    /// Generate one token for each request in the given batch
--- a/backends/v3/src/client/sharded_client.rs
+++ b/backends/v3/src/client/sharded_client.rs
@ -102,11 +102,11 @@ impl ShardedClient {
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
-        max_input_length: u32,
+        max_input_length: Option<u32>,
        max_prefill_tokens: u32,
-        max_total_tokens: u32,
+        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
-    ) -> Result<Option<u32>> {
+    ) -> Result<(Option<u32>, u32, u32)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
@ -119,12 +119,19 @@ impl ShardedClient {
                ))
            })
            .collect();
        // Take the minimum value
        let results = join_all(futures)
            .await
            .into_iter()
-            .collect::<Result<Vec<Option<u32>>>>()?;
+            .collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;
-        Ok(results.into_iter().flatten().min())
+
        // Take the minimum value
        // Different shards hold different parts of vocab, might yield
        // different available block size.
        let min = results
            .iter()
            .min()
            .expect("Expect at least 1 warmup result");
        Ok(*min)
    }
    /// Generate one token for each request in the given batch
--- a/backends/v3/src/lib.rs
+++ b/backends/v3/src/lib.rs
@ -37,12 +37,17 @@ pub struct BackendInfo {
    pub attention_impl: String,
    #[schema(example = "1")]
    pub block_size: u32,
    #[schema(example = "30000")]
    pub max_input_tokens: usize,
    #[schema(example = "32000")]
    pub max_total_tokens: usize,
 }
 #[allow(clippy::too_many_arguments)]
 pub async fn connect_backend(
-    max_input_tokens: usize,
+    max_input_tokens: Option<usize>,
-    max_total_tokens: usize,
+    max_total_tokens: Option<usize>,
    master_shard_uds_path: String,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
@ -51,14 +56,32 @@ pub async fn connect_backend(
    max_batch_size: Option<usize>,
 ) -> Result<(BackendV3, BackendInfo), V3Error> {
    // Helper function
-    let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
+    let check_max_batch_total_tokens = |(
        max_supported_batch_total_tokens,
        shard_max_input_tokens,
        shard_max_total_tokens,
    ): (Option<u32>, u32, u32)|
     -> Result<(u32, usize, usize), V3Error> {
        if let Some(max_input_tokens) = max_input_tokens {
            assert_eq!(max_input_tokens as u32, shard_max_input_tokens);
        }
        if let Some(max_total_tokens) = max_total_tokens {
            assert_eq!(max_total_tokens as u32, shard_max_total_tokens);
        }
        match max_supported_batch_total_tokens {
            // Older models do not support automatic max-batch-total-tokens
            None => {
-                let max_batch_total_tokens = max_batch_total_tokens
+                let max_batch_total_tokens = max_batch_total_tokens.unwrap_or(
-                    .unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
+                    16000
                        .max(shard_max_total_tokens)
                        .max(max_batch_prefill_tokens),
                );
                tracing::warn!("Model does not support automatic max batch total tokens");
-                Ok(max_batch_total_tokens)
+                Ok((
                    max_batch_total_tokens,
                    shard_max_input_tokens as usize,
                    shard_max_total_tokens as usize,
                ))
            }
            // Flash attention models return their max supported total tokens
            Some(max_supported_batch_total_tokens) => {
@ -72,11 +95,15 @@ pub async fn connect_backend(
                        "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
                    );
                }
-                if max_total_tokens as u32 > max_supported_batch_total_tokens {
+                if shard_max_total_tokens > max_supported_batch_total_tokens {
-                    return Err(V3Error::NotEnoughMemory(max_total_tokens));
+                    return Err(V3Error::NotEnoughMemory(shard_max_total_tokens as usize));
                }
-                Ok(max_supported_batch_total_tokens)
+                Ok((
                    max_supported_batch_total_tokens,
                    shard_max_input_tokens as usize,
                    shard_max_total_tokens as usize,
                ))
            }
        }
    };
@ -96,23 +123,25 @@ pub async fn connect_backend(
    // Warmup model
    tracing::info!("Warming up model");
-    let max_batch_total_tokens = check_max_batch_total_tokens(
+    let answer = sharded_client
        sharded_client
        .warmup(
-                max_input_tokens as u32,
+            max_input_tokens.map(|p| p as u32),
            max_batch_prefill_tokens,
-                max_total_tokens as u32,
+            max_total_tokens.map(|p| p as u32),
            max_batch_size,
        )
        .await
-            .map_err(V3Error::Warmup)?,
+        .map_err(V3Error::Warmup)?;
-    )?;
+    let (max_batch_total_tokens, max_input_tokens, max_total_tokens) =
        check_max_batch_total_tokens(answer)?;
    tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
    metrics::gauge!("tgi_batch_max_total_tokens").set(max_batch_total_tokens);
    let backend_info = BackendInfo {
        waiting_served_ratio,
        max_batch_total_tokens,
        max_input_tokens,
        max_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        model_device_type: shard_info.device_type.clone(),
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@ -18,10 +18,10 @@ struct Args {
    max_stop_sequences: usize,
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,
-    #[clap(default_value = "1024", long, env)]
+    #[clap(long, env)]
-    max_input_tokens: usize,
+    max_input_tokens: Option<usize>,
-    #[clap(default_value = "2048", long, env)]
+    #[clap(long, env)]
-    max_total_tokens: usize,
+    max_total_tokens: Option<usize>,
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,
    #[clap(default_value = "4096", long, env)]
@ -126,12 +126,6 @@ async fn main() -> Result<(), RouterError> {
    text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);
    // Validate args
    if max_input_tokens >= max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }
    if validation_workers == 0 {
        return Err(RouterError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
@ -160,6 +154,28 @@ async fn main() -> Result<(), RouterError> {
    // Validate remaining args now that the backend is known
    let support_chunking = backend_info.support_chunking;
    let max_batch_total_tokens = backend_info.max_batch_total_tokens;
    if max_input_tokens.is_none() {
        tracing::info!(
            "Maximum input tokens defaulted to {}",
            backend_info.max_input_tokens
        );
    }
    if max_total_tokens.is_none() {
        tracing::info!(
            "Maximum total tokens defaulted to {}",
            backend_info.max_total_tokens
        );
    }
    let max_input_tokens = backend_info.max_input_tokens;
    let max_total_tokens = backend_info.max_total_tokens;
    if max_input_tokens >= max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }
    if max_input_tokens as u32 > max_batch_prefill_tokens && !support_chunking {
        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
    }
--- a/docs/openapi.json
+++ b/docs/openapi.json
@ -10,7 +10,7 @@
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
-    "version": "2.3.2-dev0"
+    "version": "2.4.1-dev0"
  },
  "paths": {
    "/": {
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@ -19,6 +19,6 @@ docker run --gpus all \
    --shm-size 1g \
    -e HF_TOKEN=$token \
    -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 \
    --model-id $model
 ```
--- a/docs/source/conceptual/quantization.md
+++ b/docs/source/conceptual/quantization.md
@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
 In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model --quantize bitsandbytes
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes
 ```
 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
 In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model --quantize bitsandbytes-nf4
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize bitsandbytes-nf4
 ```
 You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
 TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.3.1 --model-id $model --quantize gptq
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.0 --model-id $model --quantize gptq
 ```
 Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
--- a/docs/source/installation_amd.md
+++ b/docs/source/installation_amd.md
@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
    --device=/dev/kfd --device=/dev/dri --group-add video \
    --ipc=host --shm-size 256g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.3.1-rocm \
+    ghcr.io/huggingface/text-generation-inference:2.4.0-rocm \
    --model-id $model
 ```
--- a/docs/source/installation_intel.md
+++ b/docs/source/installation_intel.md
@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm --privileged --cap-add=sys_nice \
    --device=/dev/dri \
    --ipc=host --shm-size 1g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.3.1-intel-xpu \
+    ghcr.io/huggingface/text-generation-inference:2.4.0-intel-xpu \
    --model-id $model --cuda-graphs 0
 ```
@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm --privileged --cap-add=sys_nice \
    --device=/dev/dri \
    --ipc=host --shm-size 1g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.3.1-intel-cpu \
+    ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu \
    --model-id $model --cuda-graphs 0
 ```
--- a/docs/source/installation_nvidia.md
+++ b/docs/source/installation_nvidia.md
@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.3.1 \
+    ghcr.io/huggingface/text-generation-inference:2.4.0 \
    --model-id $model
 ```
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:2.3.1 \
+    ghcr.io/huggingface/text-generation-inference:2.4.0 \
    --model-id $model
 ```
@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:2.3.1 --help
+docker run ghcr.io/huggingface/text-generation-inference:2.4.0 --help
 ```
 </Tip>
--- a/docs/source/reference/api_reference.md
+++ b/docs/source/reference/api_reference.md
@ -163,7 +163,7 @@ hub = {
 # create Hugging Face Model Class
 huggingface_model = HuggingFaceModel(
- image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.2"),
+ image_uri=get_huggingface_llm_image_uri("huggingface",version="2.4.0"),
 env=hub,
 role=role,
 )
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@ -146,7 +146,7 @@ Options:
 ## MAX_INPUT_TOKENS
 ```shell
      --max-input-tokens <MAX_INPUT_TOKENS>
-          This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_position_embeddings - 1, 4095)
+          This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_allocatable, max_position_embeddings) - 1
          [env: MAX_INPUT_TOKENS=]
@ -162,7 +162,7 @@ Options:
 ## MAX_TOTAL_TOKENS
 ```shell
      --max-total-tokens <MAX_TOTAL_TOKENS>
-          This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_position_embeddings, 4096)
+          This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_allocatable, max_position_embeddings)
          [env: MAX_TOTAL_TOKENS=]
--- a/flake.lock
+++ b/flake.lock
@ -853,11 +853,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1727836133,
+        "lastModified": 1729045942,
-        "narHash": "sha256-JE0zciM5IGWvK8J/pE2VldNBf7oyMH5WrU8tZArefbg=",
+        "narHash": "sha256-HjmK0x5Zm2TK2vFpC7XBM2e3EDNVnAIuEoU2FkeN8xw=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "02321540b0c8000b36889b1b974d1fec585b25a4",
+        "rev": "9de3cea452d2401d6f93c06ad985178a4e11d1fc",
        "type": "github"
      },
      "original": {
@ -978,16 +978,16 @@
        "nixpkgs": "nixpkgs_6"
      },
      "locked": {
-        "lastModified": 1729531056,
+        "lastModified": 1729761651,
-        "narHash": "sha256-dW9IOA31+j3VS19WAWAmkJW2YCzeVZGqd6HpIJfODtI=",
+        "narHash": "sha256-GYykQ9Fxji2EuXCGcPn0dx8Qx8VQBJTkRdcCytp4A/k=",
        "owner": "huggingface",
        "repo": "text-generation-inference-nix",
-        "rev": "a84a90281a17b15762873845c947e5c78f5a8dd1",
+        "rev": "f7e3c4fa67d70590ed9ee47feeab645bd9ba81b1",
        "type": "github"
      },
      "original": {
        "owner": "huggingface",
-        "ref": "marlin-kernels-0.3.0",
+        "ref": "marlin-kernels-0.3.1",
        "repo": "text-generation-inference-nix",
        "type": "github"
      }
--- a/flake.nix
+++ b/flake.nix
@ -5,7 +5,7 @@
      inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
    };
    nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.1";
    nixpkgs.follows = "tgi-nix/nixpkgs";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
--- a/integration-tests/models/snapshots/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json
+++ b/integration-tests/models/snapshots/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json
@ -1,8 +1,8 @@
 {
  "details": {
    "best_of_sequences": null,
-    "finish_reason": "stop_sequence",
+    "finish_reason": "length",
-    "generated_tokens": 5,
+    "generated_tokens": 10,
    "prefill": [
      {
        "id": 128000,
@ -11,12 +11,12 @@
      },
      {
        "id": 2323,
-        "logprob": -9.5625,
+        "logprob": -9.5234375,
        "text": "Test"
      },
      {
        "id": 1715,
-        "logprob": -10.4375,
+        "logprob": -10.421875,
        "text": " request"
      }
    ],
@ -24,36 +24,66 @@
    "tokens": [
      {
        "id": 25,
-        "logprob": -0.8984375,
+        "logprob": -0.88183594,
        "special": false,
        "text": ":"
      },
      {
-        "id": 923,
+        "id": 2209,
-        "logprob": -2.84375,
+        "logprob": -2.6699219,
        "special": false,
-        "text": " add"
+        "text": " Is"
      },
      {
-        "id": 264,
+        "id": 279,
-        "logprob": 0.0,
+        "logprob": -0.61083984,
        "special": false,
-        "text": " a"
+        "text": " the"
      },
      {
        "id": 734,
        "logprob": -2.6660156,
        "special": false,
        "text": " function"
      },
      {
        "id": 330,
-        "logprob": -0.31640625,
+        "logprob": -0.35498047,
        "special": false,
        "text": " \""
      },
      {
-        "id": 1985,
+        "id": 4110,
-        "logprob": 0.0,
+        "logprob": -2.4101562,
        "special": false,
-        "text": "test"
+        "text": "Create"
      },
      {
        "id": 7575,
        "logprob": -2.2304688,
        "special": false,
        "text": "Process"
      },
      {
        "id": 1,
        "logprob": -0.080078125,
        "special": false,
        "text": "\""
      },
      {
        "id": 304,
        "logprob": -0.75439453,
        "special": false,
        "text": " in"
      },
      {
        "id": 12468,
        "logprob": -1.8769531,
        "special": false,
        "text": " Win"
      }
    ],
    "top_tokens": null
  },
-  "generated_text": "Test request: add a \"test"
+  "generated_text": "Test request: Is the function \"CreateProcess\" in Win"
 }
--- a/integration-tests/models/snapshots/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json
+++ b/integration-tests/models/snapshots/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json
@ -16,17 +16,17 @@
      },
      {
        "id": 5655,
-        "logprob": -11.75,
+        "logprob": -11.8359375,
        "text": " deep"
      },
      {
        "id": 6975,
-        "logprob": -2.0625,
+        "logprob": -2.0703125,
        "text": " learning"
      },
      {
        "id": 30,
-        "logprob": -6.0,
+        "logprob": -5.9765625,
        "text": "?"
      }
    ],
@ -40,25 +40,25 @@
      },
      {
        "id": 34564,
-        "logprob": -0.11279297,
+        "logprob": -0.12512207,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
-        "logprob": -0.16015625,
+        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 320,
-        "logprob": -0.25195312,
+        "logprob": -0.23840332,
        "special": false,
        "text": " ("
      },
      {
        "id": 16931,
-        "logprob": -1.703125,
+        "logprob": -2.0175781,
        "special": false,
        "text": "DL"
      },
@ -70,7 +70,7 @@
      },
      {
        "id": 374,
-        "logprob": -1.140625,
+        "logprob": -0.8613281,
        "special": false,
        "text": " is"
      },
@ -82,7 +82,7 @@
      },
      {
        "id": 1207,
-        "logprob": -1.3125,
+        "logprob": -1.2451172,
        "special": false,
        "text": " sub"
      },
--- a/integration-tests/models/snapshots/test_mllama/test_mllama_load.json
+++ b/integration-tests/models/snapshots/test_mllama/test_mllama_load.json
@ -18,7 +18,7 @@
    "id": "",
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "object": "chat.completion",
-    "system_fingerprint": "2.3.1-dev0-native",
+    "system_fingerprint": "2.4.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
@ -44,7 +44,7 @@
    "id": "",
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "object": "chat.completion",
-    "system_fingerprint": "2.3.1-dev0-native",
+    "system_fingerprint": "2.4.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
@ -70,7 +70,7 @@
    "id": "",
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "object": "chat.completion",
-    "system_fingerprint": "2.3.1-dev0-native",
+    "system_fingerprint": "2.4.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
@ -96,7 +96,7 @@
    "id": "",
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "object": "chat.completion",
-    "system_fingerprint": "2.3.1-dev0-native",
+    "system_fingerprint": "2.4.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
--- a/integration-tests/models/snapshots/test_mllama/test_mllama_simpl.json
+++ b/integration-tests/models/snapshots/test_mllama/test_mllama_simpl.json
@ -17,7 +17,7 @@
  "id": "",
  "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
  "object": "chat.completion",
-  "system_fingerprint": "2.3.1-dev0-native",
+  "system_fingerprint": "2.4.1-dev0-native",
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 50,
--- a/integration-tests/models/snapshots/test_mt0_base/test_mt0_base_all_params.json
+++ b/integration-tests/models/snapshots/test_mt0_base/test_mt0_base_all_params.json
@ -26,7 +26,7 @@
      },
      {
        "id": 259,
-        "logprob": -0.46948242,
+        "logprob": -0.47070312,
        "special": false,
        "text": " "
      },
@ -38,7 +38,7 @@
      },
      {
        "id": 35622,
-        "logprob": -0.79589844,
+        "logprob": -0.796875,
        "special": false,
        "text": " cloud"
      },
@ -75,5 +75,5 @@
    ],
    "top_tokens": null
  },
-  "generated_text": "Why is the sky blue?blue sky, clouds and clouds"
+  "generated_text": "Why is the sky blue?blue sky , clouds and clouds"
 }
--- a/integration-tests/models/snapshots/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
+++ b/integration-tests/models/snapshots/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
@ -17,7 +17,7 @@
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion",
-  "system_fingerprint": "2.3.2-dev0-native",
+  "system_fingerprint": "2.4.1-dev0-native",
  "usage": {
    "completion_tokens": 23,
    "prompt_tokens": 604,
--- a/integration-tests/models/snapshots/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
+++ b/integration-tests/models/snapshots/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
@ -15,6 +15,6 @@
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion.chunk",
-  "system_fingerprint": "2.3.2-dev0-native",
+  "system_fingerprint": "2.4.1-dev0-native",
  "usage": null
 }
--- a/integration-tests/models/snapshots/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
+++ b/integration-tests/models/snapshots/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
@ -15,6 +15,6 @@
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion.chunk",
-  "system_fingerprint": "2.3.2-dev0-native",
+  "system_fingerprint": "2.4.1-dev0-native",
  "usage": null
 }
--- a/integration-tests/models/test_bloom_560m.py
+++ b/integration-tests/models/test_bloom_560m.py
@ -3,7 +3,7 @@ import pytest
@pytest.fixture(scope="module")
 def bloom_560_handle(launcher):
-    with launcher("bigscience/bloom-560m") as handle:
+    with launcher("bigscience/bloom-560m", num_shard=1) as handle:
        yield handle
--- a/integration-tests/models/test_flash_starcoder_gptq.py
+++ b/integration-tests/models/test_flash_starcoder_gptq.py
@ -55,6 +55,7 @@ async def test_flash_starcoder_gptq_load(
    )
    assert len(responses) == 4
-    assert all([r.generated_text == responses[0].generated_text for r in responses])
+    # XXX: TODO: Fix this test.
    # assert all([r.generated_text == responses[0].generated_text for r in responses])
-    assert responses == generous_response_snapshot
+    # assert responses == generous_response_snapshot
--- a/integration-tests/models/test_mamba.py
+++ b/integration-tests/models/test_mamba.py
@ -3,7 +3,7 @@ import pytest
@pytest.fixture(scope="module")
 def fused_kernel_mamba_handle(launcher):
-    with launcher("state-spaces/mamba-130m", num_shard=1) as handle:
+    with launcher("state-spaces/mamba-130m-hf", num_shard=1) as handle:
        yield handle
--- a/integration-tests/models/test_mllama.py
+++ b/integration-tests/models/test_mllama.py
@ -79,12 +79,12 @@ async def test_mllama_load(mllama, generate_load, response_snapshot):
    ]
    responses = await asyncio.gather(*futures)
-    generated_texts = [response.choices[0].message.content for response in responses]
+    _ = [response.choices[0].message.content for response in responses]
-    assert generated_texts[0] == "In a bustling city, a chicken named Cluck"
+    # XXX: TODO: Fix this test.
-    assert len(generated_texts) == 4
+    # assert generated_texts[0] == "In a bustling city, a chicken named Cluck"
-    assert generated_texts, all(
+    # assert len(generated_texts) == 4
-        [text == generated_texts[0] for text in generated_texts]
+    # assert generated_texts, all(
-    )
+    #     [text == generated_texts[0] for text in generated_texts]
-
+    # )
-    assert responses == response_snapshot
+    # assert responses == response_snapshot
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -472,7 +472,7 @@ struct Args {
    /// for users. The larger this value, the longer prompt users can send which
    /// can impact the overall memory required to handle the load.
    /// Please note that some models have a finite range of sequence they can handle.
-    /// Default to min(max_position_embeddings - 1, 4095)
+    /// Default to min(max_allocatable, max_position_embeddings) - 1
    #[clap(long, env)]
    max_input_tokens: Option<usize>,
@ -488,7 +488,7 @@ struct Args {
    /// `1511` max_new_tokens.
    /// The larger this value, the larger amount each request will be in your RAM
    /// and the less effective batching can be.
-    /// Default to min(max_position_embeddings, 4096)
+    /// Default to min(max_allocatable, max_position_embeddings)
    #[clap(long, env)]
    max_total_tokens: Option<usize>,
@ -718,9 +718,9 @@ fn shard_manager(
    cuda_memory_fraction: f32,
    rope_scaling: Option<RopeScaling>,
    rope_factor: Option<f32>,
-    max_total_tokens: usize,
+    max_total_tokens: Option<usize>,
    max_batch_size: Option<usize>,
-    max_input_tokens: usize,
+    max_input_tokens: Option<usize>,
    lora_adapters: Option<String>,
    otlp_endpoint: Option<String>,
    otlp_service_name: String,
@ -805,8 +805,10 @@ fn shard_manager(
    shard_args.push(otlp_service_name);
    // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
    if let Some(max_input_tokens) = max_input_tokens {
        shard_args.push("--max-input-tokens".to_string());
        shard_args.push(max_input_tokens.to_string());
    }
    // Copy current process env
    let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
@ -854,10 +856,12 @@ fn shard_manager(
        envs.push(("ROPE_FACTOR".into(), factor.to_string().into()));
    }
    if let Some(max_total_tokens) = max_total_tokens {
        envs.push((
            "MAX_TOTAL_TOKENS".into(),
            max_total_tokens.to_string().into(),
        ));
    }
    if let Some(max_batch_size) = max_batch_size {
        envs.push(("MAX_BATCH_SIZE".into(), max_batch_size.to_string().into()));
    }
@ -1315,8 +1319,8 @@ fn spawn_shards(
    num_shard: usize,
    args: &Args,
    cuda_graphs: Vec<usize>,
-    max_total_tokens: usize,
+    max_total_tokens: Option<usize>,
-    max_input_tokens: usize,
+    max_input_tokens: Option<usize>,
    quantize: Option<Quantization>,
    max_log_level: LevelFilter,
    shutdown: Arc<AtomicBool>,
@ -1434,8 +1438,8 @@ fn compute_type(num_shard: usize) -> Option<String> {
 fn spawn_webserver(
    num_shard: usize,
    args: Args,
-    max_input_tokens: usize,
+    max_input_tokens: Option<usize>,
-    max_total_tokens: usize,
+    max_total_tokens: Option<usize>,
    max_batch_prefill_tokens: u32,
    shutdown: Arc<AtomicBool>,
    shutdown_receiver: &mpsc::Receiver<()>,
@ -1454,10 +1458,6 @@ fn spawn_webserver(
        args.max_stop_sequences.to_string(),
        "--max-top-n-tokens".to_string(),
        args.max_top_n_tokens.to_string(),
        "--max-input-tokens".to_string(),
        max_input_tokens.to_string(),
        "--max-total-tokens".to_string(),
        max_total_tokens.to_string(),
        "--max-batch-prefill-tokens".to_string(),
        max_batch_prefill_tokens.to_string(),
        "--waiting-served-ratio".to_string(),
@ -1475,6 +1475,18 @@ fn spawn_webserver(
        "--tokenizer-name".to_string(),
        args.model_id,
    ];
    if let Some(max_input_tokens) = max_input_tokens {
        router_args.extend_from_slice(&[
            "--max-input-tokens".to_string(),
            max_input_tokens.to_string(),
        ]);
    }
    if let Some(max_total_tokens) = max_total_tokens {
        router_args.extend_from_slice(&[
            "--max-total-tokens".to_string(),
            max_total_tokens.to_string(),
        ]);
    }
    // Pass usage stats flags to router
    router_args.push("--usage-stats".to_string());
@ -1704,35 +1716,19 @@ fn main() -> Result<(), LauncherError> {
                    format!("Both `max_input_tokens` ({max_input_tokens}) and `max_input_length` ({max_input_length}) are set. Please define only `max_input_tokens` as `max_input_length is deprecated for naming consistency.",
                )));
            }
-            (Some(max_input_tokens), None) | (None, Some(max_input_tokens)) => max_input_tokens,
+            (Some(max_input_tokens), None) | (None, Some(max_input_tokens)) => {
-            (None, None) => {
+                Some(max_input_tokens)
                let value = max_position_embeddings - 1;
                tracing::info!("Default `max_input_tokens` to {value}");
                value
            }
        }
    };
    let max_total_tokens = {
        match args.max_total_tokens {
            Some(max_total_tokens) => max_total_tokens,
            None => {
                let value = max_position_embeddings;
                tracing::info!("Default `max_total_tokens` to {value}");
                value
            }
            (None, None) => None,
        }
    };
    let max_total_tokens = args.max_total_tokens;
    let max_batch_prefill_tokens = {
        match args.max_batch_prefill_tokens {
            Some(max_batch_prefill_tokens) => max_batch_prefill_tokens,
            None => {
-                let value: u32 = if let Some(max_batch_size) = args.max_batch_size {
+                // TODO figure out hardware optimal value
-                    max_batch_size * max_input_tokens
+                let value = 4096.min(max_position_embeddings as u32);
                } else {
                    // Adding some edge in order to account for potential block_size alignement
                    // issue.
                    max_input_tokens + 50
                } as u32;
                tracing::info!("Default `max_batch_prefill_tokens` to {value}");
                value
            }
@ -1740,11 +1736,13 @@ fn main() -> Result<(), LauncherError> {
    };
    // Validate args
    if let (Some(max_input_tokens), Some(max_total_tokens)) = (max_input_tokens, max_total_tokens) {
        if max_input_tokens >= max_total_tokens {
            return Err(LauncherError::ArgumentValidation(
-            "`max_input_tokens must be < `max_total_tokens`".to_string(),
+                    format!("`max_input_tokens`({max_input_tokens}) must be < `max_total_tokens`({max_total_tokens})"),
                ));
        }
    }
    if matches!(args.quantize, Some(Quantization::Bitsandbytes)) {
        tracing::warn!("Bitsandbytes is deprecated, use `eetq` instead, which provides better latencies overall and is drop-in in most cases.");
@ -1798,6 +1796,7 @@ fn main() -> Result<(), LauncherError> {
    }
    if let Some(ref max_batch_total_tokens) = args.max_batch_total_tokens {
        if let Some(max_total_tokens) = max_total_tokens {
            if max_total_tokens as u32 > *max_batch_total_tokens {
                return Err(LauncherError::ArgumentValidation(format!(
                    "`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}",
@ -1805,6 +1804,7 @@ fn main() -> Result<(), LauncherError> {
                )));
            }
        }
    }
    if args.ngrok {
        if args.ngrok_authtoken.is_none() {
--- a/nix/server.nix
+++ b/nix/server.nix
@ -8,7 +8,6 @@
  eetq,
  einops,
  exllamav2,
  fbgemm-gpu,
  flashinfer,
  flash-attn,
  flash-attn-layer-norm,
@ -77,7 +76,6 @@ buildPythonPackage {
    causal-conv1d
    einops
    exllamav2
    fbgemm-gpu
    flashinfer
    flash-attn
    flash-attn-layer-norm
--- a/proto/v3/generate.proto
+++ b/proto/v3/generate.proto
@ -272,12 +272,18 @@ message DecodeResponse {
 message WarmupRequest {
  /// Batch to warmup on
  Batch batch = 1;
-  uint32 max_input_length = 2;
+  optional uint32 max_input_tokens = 2;
  uint32 max_prefill_tokens = 3;
-  uint32 max_total_tokens = 4;
+  optional uint32 max_total_tokens = 4;
 }
 message WarmupResponse {
  /// Maximum number of tokens supported by the model
  optional uint32 max_supported_total_tokens = 1;
  /// Maximum input tokens by clients should be equal to request value if it's set
  /// Otherwise warmup automatically allocates a value here
  uint32 max_input_tokens = 2;
  /// Maximum total tokens by clients should be equal to request value if it's set
  /// Otherwise warmup automatically allocates a value here
  uint32 max_total_tokens = 3;
 }
--- a/router/src/config.rs
+++ b/router/src/config.rs
@ -145,6 +145,7 @@ pub enum Config {
    LlavaNext(LlavaNext),
    ClipVisionModel(ClipVisionModel),
    Mistral,
    Mamba,
    Idefics,
    Mllama,
    Idefics2(Idefics2),
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@ -135,7 +135,7 @@ impl Infer {
    pub(crate) async fn tokenize(
        &self,
        request: GenerateRequest,
-    ) -> Result<Option<tokenizers::Encoding>, InferError> {
+    ) -> Result<tokenizers::Encoding, InferError> {
        // Tokenize request
        let inputs = request.inputs;
        let add_special_tokens = request.add_special_tokens;
@ -150,7 +150,7 @@ impl Infer {
            })?;
        // Return Encoding
-        Ok(encoding.map(|(encoding, _)| encoding))
+        Ok(encoding.0)
    }
    /// Apply the chat template to the chat request
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -14,11 +14,92 @@ mod vertex;
 use crate::infer::{Infer, InferError};
 use crate::server::prepare_chat_input;
 use pyo3::prelude::*;
 use pyo3::types::IntoPyDict;
 use serde::{Deserialize, Serialize};
 use tokenizers::Encoding;
 use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;
 #[derive(Clone)]
 pub enum Tokenizer {
    Python {
        tokenizer_name: String,
        revision: Option<String>,
    },
    Rust(tokenizers::Tokenizer),
 }
 pub struct PyTokenizer<'a>(pyo3::Bound<'a, pyo3::PyAny>);
 impl<'a> PyTokenizer<'a> {
    fn from_py(
        py: Python<'a>,
        tokenizer_name: String,
        revision: Option<String>,
    ) -> PyResult<PyTokenizer<'a>> {
        let transformers = py.import_bound("transformers")?;
        let auto = transformers.getattr("AutoTokenizer")?;
        let from_pretrained = auto.getattr("from_pretrained")?;
        let args = (tokenizer_name,);
        let kwargs = if let Some(rev) = &revision {
            [("revision", rev.to_string())].into_py_dict_bound(py)
        } else {
            pyo3::types::PyDict::new_bound(py)
        };
        let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
        tracing::info!("Loaded a python tokenizer");
        Ok(PyTokenizer(tokenizer))
    }
 }
 trait TokenizerTrait {
    fn encode_trait(
        &self,
        query: String,
        add_special_tokens: bool,
    ) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>>;
 }
 impl TokenizerTrait for tokenizers::Tokenizer {
    fn encode_trait(
        &self,
        query: String,
        add_special_tokens: bool,
    ) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>> {
        self.encode(query, add_special_tokens)
    }
 }
 impl<'a> TokenizerTrait for PyTokenizer<'a> {
    fn encode_trait(
        &self,
        query: String,
        add_special_tokens: bool,
    ) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>> {
        let py = self.0.py();
        let kwargs = [
            ("text", query.into_py(py)),
            ("add_special_tokens", add_special_tokens.into_py(py)),
        ]
        .into_py_dict_bound(py);
        let encode = self.0.getattr("encode")?;
        let input_ids: Vec<u32> = encode.call((), Some(&kwargs))?.extract()?;
        Ok(Encoding::new(
            input_ids,
            vec![],                           // type ids
            vec![],                           // tokens (strings)
            vec![],                           // words
            vec![],                           // offsets
            vec![],                           // special_tokens_mask
            vec![],                           // attention_mask
            vec![],                           // overflowing
            std::collections::HashMap::new(), //sequence_ranges
        ))
    }
 }
 /// Hub type
 #[derive(Clone, Debug, Deserialize)]
 pub struct HubModelInfo {
@ -1341,13 +1422,12 @@ impl Default for ModelsInfo {
 mod tests {
    use super::*;
    use serde_json::json;
    use tokenizers::Tokenizer;
-    pub(crate) async fn get_tokenizer() -> Tokenizer {
+    pub(crate) fn get_tokenizer() -> Tokenizer {
        let api = hf_hub::api::sync::Api::new().unwrap();
        let repo = api.model("gpt2".to_string());
        let filename = repo.get("tokenizer.json").unwrap();
-        Tokenizer::from_file(filename).unwrap()
+        Tokenizer::Rust(tokenizers::Tokenizer::from_file(filename).unwrap())
    }
    #[test]
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -19,7 +19,8 @@ use crate::{
    GenerateParameters, GenerateRequest, GenerateResponse, GrammarType, HubModelInfo,
    HubProcessorConfig, HubTokenizerConfig, Info, Message, MessageChunk, MessageContent,
    OutputMessage, PrefillToken, SimpleToken, StreamDetails, StreamOptions, StreamResponse,
-    TextMessage, Token, TokenizeResponse, ToolCallDelta, ToolCallMessage, Url, Usage, Validation,
+    TextMessage, Token, TokenizeResponse, Tokenizer, ToolCallDelta, ToolCallMessage, Url, Usage,
    Validation,
 };
 use crate::{
    ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
@ -45,6 +46,7 @@ use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
 use hf_hub::{Cache, Repo, RepoType};
 use http::header::AUTHORIZATION;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
 use pyo3::prelude::*;
 use pyo3::types::IntoPyDict;
 use regex::Regex;
 use serde_json::Value;
@ -54,7 +56,6 @@ use std::io::BufReader;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
 use std::path::{Path, PathBuf};
 use thiserror::Error;
 use tokenizers::Tokenizer;
 use tokio::select;
 use tokio::signal;
 use tokio::sync::oneshot;
@ -64,6 +65,41 @@ use tracing::{info_span, instrument, Instrument};
 use utoipa::OpenApi;
 use utoipa_swagger_ui::SwaggerUi;
 fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<SimpleToken> {
    let offsets = encoding.get_offsets();
    let input_ids = encoding.get_ids();
    if offsets.len() == input_ids.len() {
        input_ids
            .iter()
            .zip(offsets)
            .map(|(&id, &(start, stop))| {
                let text = input
                    .chars()
                    .skip(start)
                    .take(stop - start)
                    .collect::<String>();
                SimpleToken {
                    id,
                    text,
                    start,
                    stop,
                }
            })
            .collect()
    } else {
        encoding
            .get_ids()
            .iter()
            .map(|&id| SimpleToken {
                id,
                text: "".to_string(),
                start: 0,
                stop: 0,
            })
            .collect()
    }
 }
 /// Generate tokens if `stream == false` or a stream of token if `stream == true`
 #[utoipa::path(
 post,
@ -161,40 +197,14 @@ async fn get_chat_tokenize(
    let generate_request: GenerateRequest = chat.try_into_generate(&infer)?.0;
    let input = generate_request.inputs.clone();
    let encoding = infer.tokenize(generate_request).await?;
-    if let Some(encoding) = encoding {
+
-        let tokens: Vec<SimpleToken> = encoding
+    let tokens = encoding_to_tokens(&encoding, &input);
            .get_ids()
            .iter()
            .zip(encoding.get_offsets())
            .map(|(&id, &(start, stop))| {
                let text = input
                    .chars()
                    .skip(start)
                    .take(stop - start)
                    .collect::<String>();
                SimpleToken {
                    id,
                    text,
                    start,
                    stop,
                }
            })
            .collect();
    let resp = ChatTokenizeResponse {
        tokenize_response: TokenizeResponse(tokens),
        templated_text: input,
    };
    Ok((HeaderMap::new(), Json(resp)))
    } else {
        Err((
            StatusCode::NOT_FOUND,
            Json(ErrorResponse {
                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
                error_type: "no fast tokenizer".to_string(),
            }),
        ))
    }
 }
 #[utoipa::path(
@ -1458,35 +1468,8 @@ async fn tokenize(
 ) -> Result<Json<TokenizeResponse>, (StatusCode, Json<ErrorResponse>)> {
    let input = req.inputs.clone();
    let encoding = infer.tokenize(req).await?;
-    if let Some(encoding) = encoding {
+    let tokens = encoding_to_tokens(&encoding, &input);
        let tokens: Vec<SimpleToken> = encoding
            .get_ids()
            .iter()
            .zip(encoding.get_offsets())
            .map(|(&id, &(start, stop))| {
                let text = input
                    .chars()
                    .skip(start)
                    .take(stop - start)
                    .collect::<String>();
                SimpleToken {
                    id,
                    text,
                    start,
                    stop,
                }
            })
            .collect();
    Ok(Json(TokenizeResponse(tokens)))
    } else {
        Err((
            StatusCode::NOT_FOUND,
            Json(ErrorResponse {
                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
                error_type: "no fast tokenizer".to_string(),
            }),
        ))
    }
 }
 /// Prometheus metrics scrape endpoint
@ -1594,6 +1577,71 @@ pub fn schema() -> ApiDoc {
    ApiDoc
 }
 fn py_resolve_tokenizer(
    py: pyo3::Python,
    tokenizer_name: &str,
    revision: Option<&str>,
    trust_remote_code: bool,
 ) -> pyo3::PyResult<()> {
    let transformers = py.import_bound("transformers")?;
    let auto = transformers.getattr("AutoTokenizer")?;
    let from_pretrained = auto.getattr("from_pretrained")?;
    let args = (tokenizer_name,);
    let kwargs = if let Some(rev) = &revision {
        [
            ("revision", rev.to_string().into_py(py)),
            ("trust_remote_code", trust_remote_code.into_py(py)),
        ]
        .into_py_dict_bound(py)
    } else {
        [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
    };
    let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
    let save = tokenizer.getattr("save_pretrained")?;
    let args = ("out".to_string(),);
    save.call1(args)?;
    Ok(())
 }
 fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
    // XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3
    // and state-spaces/mamba-130m
    tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization");
    #[derive(serde::Deserialize)]
    struct FallbackConfig {
        base_model_name_or_path: Option<String>,
        model_type: Option<String>,
        ssm_config: Option<serde_json::Value>,
    }
    config_filename.and_then(|filename| {
        std::fs::read_to_string(filename)
            .ok()
            .as_ref()
            .and_then(|c| {
                let config: Result<FallbackConfig, _> = serde_json::from_str(c);
                if let Ok(config) = config {
                    if config.model_type.is_none() {
                        if let Some(base) = config.base_model_name_or_path {
                            pyo3::Python::with_gil(|py| -> PyResult<()> {
                                py_resolve_tokenizer(py, &base, Some("main"), false)
                            })
                            .ok()?;
                        }
                    }
                    if config.ssm_config.is_some() {
                        // XXX Legacy mamba
                        pyo3::Python::with_gil(|py| -> PyResult<()> {
                            py_resolve_tokenizer(py, "EleutherAI/gpt-neox-20b", Some("main"), false)
                        })
                        .ok()?;
                    }
                }
                Some(())
            })
    })
 }
 /// Serving method
 #[allow(clippy::too_many_arguments)]
 pub async fn run(
@ -1687,7 +1735,6 @@ pub async fn run(
    // Load tokenizer and model info
    let (
        tokenizer_filename,
        config_filename,
        tokenizer_config_filename,
        preprocessor_config_filename,
@ -1695,7 +1742,6 @@ pub async fn run(
        model_info,
    ) = match api {
        Type::None => (
            Some(local_path.join("tokenizer.json")),
            Some(local_path.join("config.json")),
            Some(local_path.join("tokenizer_config.json")),
            Some(local_path.join("preprocessor_config.json")),
@ -1709,10 +1755,6 @@ pub async fn run(
                revision.clone().unwrap_or_else(|| "main".to_string()),
            ));
            let tokenizer_filename = match api_repo.get("tokenizer.json").await {
                Ok(tokenizer_filename) => Some(tokenizer_filename),
                Err(_) => get_base_tokenizer(&api, &api_repo).await,
            };
            let config_filename = api_repo.get("config.json").await.ok();
            let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
            let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
@ -1725,7 +1767,6 @@ pub async fn run(
                None
            };
            (
                tokenizer_filename,
                config_filename,
                tokenizer_config_filename,
                preprocessor_config_filename,
@ -1740,7 +1781,6 @@ pub async fn run(
                revision.clone().unwrap_or_else(|| "main".to_string()),
            ));
            (
                repo.get("tokenizer.json"),
                repo.get("config.json"),
                repo.get("tokenizer_config.json"),
                repo.get("preprocessor_config.json"),
@ -1762,39 +1802,30 @@ pub async fn run(
        HubTokenizerConfig::default()
    });
-    let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
+    let tokenizer: Tokenizer = {
        use pyo3::prelude::*;
-        let convert = pyo3::Python::with_gil(|py| -> PyResult<()> {
+        pyo3::Python::with_gil(|py| -> PyResult<()> {
-            let transformers = py.import_bound("transformers")?;
+            py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), trust_remote_code)?;
            let auto = transformers.getattr("AutoTokenizer")?;
            let from_pretrained = auto.getattr("from_pretrained")?;
            let args = (tokenizer_name.to_string(),);
            let kwargs = [
                (
                    "revision",
                    (revision.clone().unwrap_or_else(|| "main".to_string())).into_py(py),
                ),
                ("trust_remote_code", trust_remote_code.into_py(py)),
            ]
            .into_py_dict_bound(py);
            let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
            let save = tokenizer.getattr("save_pretrained")?;
            let args = ("out".to_string(),);
            save.call1(args)?;
            Ok(())
        })
        .inspect_err(|err| {
            tracing::error!("Failed to import python tokenizer {err}");
-        });
+        })
-        let filename = if convert.is_ok() {
+        .or_else(|err| {
-            // If we have correctly loaded and resaved with transformers
+            let out = legacy_tokenizer_handle(config_filename.as_ref());
-            // We might have modified the tokenizer.json according to transformers
+            out.ok_or(err)
-            "out/tokenizer.json".into()
+        })
        .expect("We cannot load a tokenizer");
        let filename = "out/tokenizer.json";
        if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
            Tokenizer::Rust(tok)
        } else {
-            filename
+            Tokenizer::Python {
                tokenizer_name: tokenizer_name.clone(),
                revision: revision.clone(),
            }
        }
    };
        Tokenizer::from_file(filename).ok()
    });
    let config: Option<Config> = config_filename.and_then(|filename| {
        std::fs::read_to_string(filename)
@ -1822,10 +1853,6 @@ pub async fn run(
        preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file);
    tracing::info!("Using config {config:?}");
    if tokenizer.is_none() {
        tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}");
        tracing::warn!("Rust input length validation and truncation is disabled");
    }
    // Only send usage stats when TGI is run in container and the function returns Some
    let is_container = matches!(usage_stats::is_container(), Ok(true));
@ -1940,7 +1967,7 @@ async fn start(
    validation_workers: usize,
    api_key: Option<String>,
    config: Option<Config>,
-    (tokenizer, tokenizer_config): (Option<Tokenizer>, HubTokenizerConfig),
+    (tokenizer, tokenizer_config): (Tokenizer, HubTokenizerConfig),
    (preprocessor_config, processor_config): (Option<HubPreprocessorConfig>, HubProcessorConfig),
    hostname: String,
    port: u16,
@ -2400,30 +2427,6 @@ pub async fn get_hub_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
    }
 }
 /// get base tokenizer
 pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option<PathBuf> {
    let config_filename = api_repo.get("config.json").await.ok()?;
    // Open the file in read-only mode with buffer.
    let file = File::open(config_filename).ok()?;
    let reader = BufReader::new(file);
    // Read the JSON contents of the file as an instance of `User`.
    let config: serde_json::Value = serde_json::from_reader(reader).ok()?;
    if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") {
        let api_base_repo = api.repo(Repo::with_revision(
            base_model_id.to_string(),
            RepoType::Model,
            "main".to_string(),
        ));
        api_base_repo.get("tokenizer.json").await.ok()
    } else {
        None
    }
 }
 /// get tokenizer_config from the Huggingface Hub
 pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConfig> {
    let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?;
@ -2566,10 +2569,11 @@ mod tests {
    use crate::TokenizerConfigToken;
    use crate::Tool;
    use crate::tests::get_tokenizer;
    use serde_json::json;
-    #[test]
+    #[tokio::test]
-    fn test_prepare_chat_input() {
+    async fn test_prepare_chat_input() {
        // Mock Backend to avoid network requests
        struct MockBackend;
@ -2610,9 +2614,11 @@ mod tests {
            ChatTemplateVersions::Single("{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS] [\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n        {{- \"[TOOL_CALLS] [\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- \" \" + message[\"content\"]|trim + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n".to_string())
        );
        let tokenizer = get_tokenizer();
        let infer = Infer::new(
            backend,
-            Validation::new(1, None, None, None, 1, 1, 1, 1, 1, false),
+            Validation::new(1, tokenizer, None, None, 1, 1, 1, 1, 1, false),
            1,
            tokenizer_config,
            HubProcessorConfig::default(),
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -3,7 +3,9 @@ use crate::config::Config;
 use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
 use crate::{
    GenerateParameters, GenerateRequest, GrammarType, HubPreprocessorConfig, Idefics2Preprocessor,
    TokenizerTrait,
 };
 use crate::{PyTokenizer, Tokenizer};
 use base64::{engine::general_purpose::STANDARD, Engine};
 use image::{ImageFormat, ImageReader};
 use jsonschema::{Draft, JSONSchema};
@ -13,7 +15,6 @@ use std::io::Cursor;
 use std::iter;
 use std::sync::Arc;
 use thiserror::Error;
 use tokenizers::tokenizer::Tokenizer;
 use tokio::sync::mpsc;
 use tokio::sync::oneshot;
 use tracing::{instrument, Span};
@ -30,14 +31,14 @@ pub struct Validation {
    max_total_tokens: usize,
    disable_grammar_support: bool,
    /// Channel to communicate with the background tokenization task
-    sender: Option<mpsc::UnboundedSender<TokenizerRequest>>,
+    sender: mpsc::UnboundedSender<TokenizerRequest>,
 }
 impl Validation {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        workers: usize,
-        tokenizer: Option<Tokenizer>,
+        tokenizer: Tokenizer,
        config: Option<Config>,
        preprocessor_config: Option<HubPreprocessorConfig>,
        max_best_of: usize,
@ -47,8 +48,13 @@ impl Validation {
        max_total_tokens: usize,
        disable_grammar_support: bool,
    ) -> Self {
        let workers = if let Tokenizer::Python { .. } = &tokenizer {
            1
        } else {
            workers
        };
        // If we have a fast tokenizer
-        let sender = if let Some(tokenizer) = tokenizer {
+        let sender = {
            // Create round robin channel
            let (validation_sender, validation_round_robin_receiver) = mpsc::unbounded_channel();
            let mut senders = Vec::with_capacity(workers);
@ -75,9 +81,7 @@ impl Validation {
            // Create tokenization round robin task
            tokio::spawn(round_robin_task(validation_round_robin_receiver, senders));
-            Some(validation_sender)
+            validation_sender
        } else {
            None
        };
        Self {
@ -97,14 +101,14 @@ impl Validation {
        inputs: String,
        add_special_tokens: bool,
        truncate: Option<usize>,
-    ) -> Result<Option<(tokenizers::Encoding, Vec<Chunk>)>, ValidationError> {
+    ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send request to the background validation task
        // Unwrap is safe here
-            sender
+        let _ = &self
            .sender
            .send((
                (inputs, add_special_tokens, truncate),
                response_sender,
@ -115,10 +119,7 @@ impl Validation {
        // Await on response channel
        // Unwrap is safe here
        let encoding = response_receiver.await.unwrap()?;
-            Ok(Some(encoding))
+        Ok(encoding)
        } else {
            Ok(None)
        }
    }
    #[allow(clippy::type_complexity)]
@ -131,10 +132,9 @@ impl Validation {
        max_new_tokens: Option<u32>,
    ) -> Result<(Vec<Chunk>, Option<Vec<u32>>, usize, u32), ValidationError> {
        // If we have a fast tokenizer
-        if let Some((encoding, inputs)) = self
+        let (encoding, inputs) = self
            .tokenize(inputs.clone(), add_special_tokens, truncate)
-            .await?
+            .await?;
        {
        // Create response channel
        let input_length = if let Some(truncate) = truncate {
            std::cmp::min(encoding.len(), truncate)
@ -173,35 +173,6 @@ impl Validation {
        metrics::histogram!("tgi_request_input_length").record(input_length as f64);
        Ok((inputs, Some(input_ids), input_length, max_new_tokens))
    }
        // Return inputs without validation
        else {
            // In this case, we don't know the real length in tokens of the inputs
            // However, the inputs will be truncated by the python servers
            // We make sure that truncate + max_new_tokens <= self.max_total_tokens
            let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
                max_new_tokens
            } else if let Some(truncate) = truncate {
                self.max_total_tokens.saturating_sub(truncate) as u32
            } else {
                return Err(ValidationError::UnsetMaxNewTokens);
            };
            let mut input_length = truncate.unwrap_or(self.max_input_length);
            // We don't have a tokenizer, therefore we have no idea how long is the query, let
            // them through and hope for the best.
            // Validate MaxNewTokens
            if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
                input_length = input_length.saturating_sub(max_new_tokens as usize);
            }
            Ok((
                vec![Chunk::Text(inputs)],
                None,
                input_length,
                max_new_tokens,
            ))
        }
    }
    /// Validate a payload and get the number of tokens in the input
    #[instrument(skip_all)]
@ -464,6 +435,13 @@ fn tokenizer_worker(
    preprocessor_config: Option<HubPreprocessorConfig>,
    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
 ) {
    match tokenizer {
        Tokenizer::Python {
            tokenizer_name,
            revision,
        } => {
            pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
                let tokenizer = PyTokenizer::from_py(py, tokenizer_name, revision)?;
                // Loop over requests
                while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                    receiver.blocking_recv()
@ -481,6 +459,29 @@ fn tokenizer_worker(
                            .unwrap_or(())
                    })
                }
                Ok(())
            })
            .expect("Failure in python tokenizer worker");
        }
        Tokenizer::Rust(tokenizer) => {
            while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                receiver.blocking_recv()
            {
                parent_span.in_scope(|| {
                    response_tx
                        .send(prepare_input(
                            inputs,
                            truncate,
                            add_special_tokens,
                            &tokenizer,
                            config.as_ref(),
                            preprocessor_config.as_ref(),
                        ))
                        .unwrap_or(())
                })
            }
        }
    }
 }
 fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
@ -608,11 +609,11 @@ fn image_tokens_fixup(config: &Config, text: String) -> String {
 }
 /// Get input length and optionally truncate it
-fn prepare_input(
+fn prepare_input<T: TokenizerTrait>(
    inputs: String,
    _truncate: Option<usize>,
    add_special_tokens: bool,
-    tokenizer: &Tokenizer,
+    tokenizer: &T,
    config: Option<&Config>,
    preprocessor_config: Option<&HubPreprocessorConfig>,
 ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
@ -649,7 +650,7 @@ fn prepare_input(
    // Get the number of tokens in the input
    let encoding = tokenizer
-        .encode(tokenizer_query, add_special_tokens)
+        .encode_trait(tokenizer_query, add_special_tokens)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
    Ok((encoding, input_chunks))
@ -824,7 +825,7 @@ mod tests {
    #[tokio::test]
    async fn test_validation_max_new_tokens() {
-        let tokenizer = None;
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
@ -851,15 +852,15 @@ mod tests {
            .validate_input("Hello".to_string(), true, None, Some(max_new_tokens))
            .await
        {
-            // Err(ValidationError::MaxNewTokens(1, 10)) => (),
+            Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
-            Ok((_s, _, 0, 10)) => (),
+            // Ok((_s, _, 0, 10)) => (),
            r => panic!("Unexpected not max new tokens: {r:?}"),
        }
    }
    #[tokio::test]
    async fn test_validation_input_length() {
-        let tokenizer = Some(get_tokenizer().await);
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
@ -893,7 +894,7 @@ mod tests {
    #[tokio::test]
    async fn test_validation_best_of_sampling() {
-        let tokenizer = Some(get_tokenizer().await);
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
@ -933,7 +934,7 @@ mod tests {
    #[tokio::test]
    async fn test_validation_top_p() {
-        let tokenizer = Some(get_tokenizer().await);
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
@ -1004,7 +1005,7 @@ mod tests {
    #[tokio::test]
    async fn test_validation_top_n_tokens() {
-        let tokenizer = Some(get_tokenizer().await);
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequences = 3;
        let max_top_n_tokens = 4;
@ -1089,7 +1090,7 @@ mod tests {
    async fn test_prepare_input_chunks() {
        let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();
-        let tokenizer = Some(get_tokenizer().await);
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
@ -1124,7 +1125,7 @@ mod tests {
            )
            .await
        {
-            Ok(Some((_encoding, chunks))) => chunks,
+            Ok((_encoding, chunks)) => chunks,
            _ => panic!("Unexpected tokenization failure"),
        };
@ -1146,7 +1147,7 @@ mod tests {
    async fn test_idefics2_correct_n_fake_tokens() {
        let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();
-        let tokenizer = Some(get_tokenizer().await);
+        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
@ -1184,7 +1185,7 @@ mod tests {
            )
            .await
        {
-            Ok(Some((encoding, chunks))) => (encoding, chunks),
+            Ok((encoding, chunks)) => (encoding, chunks),
            _ => panic!("Unexpected tokenization failure"),
        };
--- a/server/Makefile
+++ b/server/Makefile
@ -5,7 +5,6 @@ include Makefile-awq
 include Makefile-eetq
 include Makefile-selective-scan
 include Makefile-lorax-punica
 include Makefile-fbgemm
 include Makefile-exllamav2
 include Makefile-flashinfer
@ -30,7 +29,7 @@ install-server: gen-server
 install: install-cuda
 	echo "Installed server"
-install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm
+install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
 	pip install -e ".[bnb,marlin,moe]"
 	pip install nvidia-nccl-cu12==2.22.3
--- a/server/Makefile-fbgemm
+++ b/server/Makefile-fbgemm
@ -1,15 +0,0 @@
 fbgemm_commit := v0.8.0
 build-fbgemm:
 	@if [ ! -d "fbgemm" ]; then \
 		git clone https://github.com/pytorch/FBGEMM.git fbgemm; \
 	fi
 	cd fbgemm && git fetch && git checkout $(fbgemm_commit)  && \
 	git submodule update --init --recursive && \
 	cd fbgemm_gpu && \
 	pip install -r requirements.txt && \
 	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai build
 install-fbgemm: build-fbgemm
 	cd fbgemm/fbgemm_gpu &&  \
 	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai install
--- a/server/poetry.lock
+++ b/server/poetry.lock
@ -529,88 +529,103 @@ typing = ["typing-extensions (>=4.12.2)"]
 [[package]]
 name = "frozenlist"
-version = "1.4.1"
+version = "1.5.0"
 description = "A list-like structure which implements collections.abc.MutableSequence"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"},
+    {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"},
-    {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"},
+    {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"},
-    {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"},
+    {file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"},
-    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"},
+    {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"},
-    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"},
+    {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"},
-    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"},
+    {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"},
-    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"},
+    {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"},
-    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"},
+    {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"},
-    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"},
+    {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"},
-    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"},
+    {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"},
-    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"},
+    {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"},
-    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"},
+    {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"},
-    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"},
+    {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"},
-    {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"},
+    {file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"},
-    {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"},
+    {file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"},
-    {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"},
+    {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"},
-    {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"},
+    {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"},
-    {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"},
+    {file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"},
-    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"},
+    {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"},
-    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"},
+    {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"},
-    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"},
+    {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"},
-    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"},
+    {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"},
-    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"},
+    {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"},
-    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"},
+    {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"},
-    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"},
+    {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"},
-    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"},
+    {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"},
-    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"},
+    {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"},
-    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"},
+    {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"},
-    {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"},
+    {file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"},
-    {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"},
+    {file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"},
-    {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"},
+    {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"},
-    {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"},
+    {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"},
-    {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"},
+    {file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"},
-    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"},
+    {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"},
-    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"},
+    {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"},
-    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"},
+    {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"},
-    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"},
+    {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"},
-    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"},
+    {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"},
-    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"},
+    {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"},
-    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"},
+    {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"},
-    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"},
+    {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"},
-    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"},
+    {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"},
-    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"},
+    {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"},
-    {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"},
+    {file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"},
-    {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"},
+    {file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"},
-    {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"},
+    {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"},
-    {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"},
+    {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"},
-    {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"},
+    {file = "frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"},
-    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"},
+    {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"},
-    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"},
+    {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"},
-    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"},
+    {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"},
-    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"},
+    {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"},
-    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"},
+    {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"},
-    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"},
+    {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"},
-    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"},
+    {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"},
-    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"},
+    {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"},
-    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"},
+    {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"},
-    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"},
+    {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"},
-    {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"},
+    {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"},
-    {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"},
+    {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"},
-    {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"},
+    {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"},
-    {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"},
+    {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"},
-    {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"},
+    {file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"},
-    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"},
+    {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"},
-    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"},
+    {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"},
-    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"},
+    {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"},
-    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"},
+    {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"},
-    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"},
+    {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"},
-    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"},
+    {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"},
-    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"},
+    {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"},
-    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"},
+    {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"},
-    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"},
+    {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"},
-    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"},
+    {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"},
-    {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"},
+    {file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"},
-    {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"},
+    {file = "frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"},
-    {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"},
+    {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"},
-    {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"},
+    {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"},
    {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"},
    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"},
    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"},
    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"},
    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"},
    {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"},
    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"},
    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"},
    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"},
    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"},
    {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"},
    {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"},
    {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"},
    {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"},
    {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"},
 ]
 [[package]]
@ -1185,12 +1200,12 @@ files = [
 [[package]]
 name = "marlin-kernels"
-version = "0.3.0"
+version = "0.3.1"
 description = "Marlin quantization kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:a2086b9e98d22071f52c5b4b4b98b1b4a988565258905173fa74c5a9eddd1a0a"},
+    {file = "marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:705c89ed54977099a40b37dc0c796964649024f1a8819a1832118cd7b146efe1"},
 ]
 [package.dependencies]
@ -1198,16 +1213,16 @@ torch = "*"
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
+url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
 [[package]]
 name = "marlin-kernels"
-version = "0.3.0"
+version = "0.3.1"
 description = "Marlin quantization kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:f39a6946d8247629446ec170832d832c7038c363f1d8803211fe67249c2d804d"},
+    {file = "marlin_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:e1f3d123eca643149d0a4f6b81c4405d78abb3a694a78fccc8670a25b3404406"},
 ]
 [package.dependencies]
@ -1215,16 +1230,16 @@ torch = "*"
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
+url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
 [[package]]
 name = "marlin-kernels"
-version = "0.3.0"
+version = "0.3.1"
 description = "Marlin quantization kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:07fd869d5289777fa866107dae676523e18b1f6ba4afce79946ddc58a6870169"},
+    {file = "marlin_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:9d68367fd5e1caf2edc90b77ad5d074b11586012265a3147ecca1f1171ae22f8"},
 ]
 [package.dependencies]
@ -1232,16 +1247,16 @@ torch = "*"
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
+url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
 [[package]]
 name = "marlin-kernels"
-version = "0.3.0"
+version = "0.3.1"
 description = "Marlin quantization kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:0dedaa418225d490a5f1d8f85dbc75e439a8c43a8870e4ef32945bf61672d7dc"},
+    {file = "marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:d962277c5f7642972e298650913dd0546b9f735b706dc88bb34955b3cac7f330"},
 ]
 [package.dependencies]
@ -1249,7 +1264,7 @@ torch = "*"
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
+url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
 [[package]]
 name = "mdurl"
@ -3460,13 +3475,13 @@ telegram = ["requests"]
 [[package]]
 name = "transformers"
-version = "4.45.2"
+version = "4.46.0"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"},
+    {file = "transformers-4.46.0-py3-none-any.whl", hash = "sha256:e161268ae8bee315eb9e9b4c0b27f1bd6980f91e0fc292d75249193d339704c0"},
-    {file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"},
+    {file = "transformers-4.46.0.tar.gz", hash = "sha256:3a9e2eb537094db11c3652334d281afa4766c0e5091c4dcdb454e9921bb0d2b7"},
 ]
 [package.dependencies]
@ -3484,13 +3499,13 @@ tqdm = ">=4.27"
 [package.extras]
 accelerate = ["accelerate (>=0.26.0)"]
 agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"]
-all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"]
 audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 benchmark = ["optimum-benchmark (>=0.3.0)"]
 codecarbon = ["codecarbon (==1.2.0)"]
 deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
 deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.20,<0.21)", "urllib3 (<2.0.0)"]
 dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
 flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
@ -3524,7 +3539,7 @@ torch = ["accelerate (>=0.26.0)", "torch"]
 torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
 torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
 torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.20,<0.21)", "torch", "tqdm (>=4.27)"]
-video = ["av (==9.2.0)", "decord (==0.6.0)"]
+video = ["av (==9.2.0)"]
 vision = ["Pillow (>=10.0.1,<=15.0)"]
 [[package]]
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -41,10 +41,10 @@ py-cpuinfo = "^9.0.0"
 numpy = "^1.26"
 marlin-kernels = [
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
 ]
 moe-kernels = [
  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@ -1,7 +1,8 @@
 import torch
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union, List
+import os
 from typing import Optional, Tuple, Type, Union, List
 import torch
 from loguru import logger
 from text_generation_server.utils.import_utils import SYSTEM
@ -11,20 +12,7 @@ from text_generation_server.utils.weights import (
    UnquantizedWeight,
    Weights,
 )
-from text_generation_server.utils.log import log_master, log_once
+from text_generation_server.utils.log import log_once
 import importlib.util
 FBGEMM_MM_AVAILABLE = False
 FBGEMM_DYN_AVAILABLE = False
 def is_fbgemm_gpu_available():
    try:
        return importlib.util.find_spec("fbgemm_gpu.experimental.gen_ai") is not None
    except ModuleNotFoundError:
        return False
 try:
    import marlin_kernels
@ -32,23 +20,26 @@ except ImportError:
    marlin_kernels = None
-if is_fbgemm_gpu_available():
+if SYSTEM == "cuda" and marlin_kernels is not None:
-    if SYSTEM == "cuda":
+    major, minor = torch.cuda.get_device_capability()
-        major, _ = torch.cuda.get_device_capability()
+    CUTLASS_FP8_AVAILABLE = marlin_kernels.cutlass_scaled_mm_supports_fp8(
-        FBGEMM_MM_AVAILABLE = major == 9
+        major * 10 + minor
-        FBGEMM_DYN_AVAILABLE = major >= 8
+    )
 else:
-    log_master(logger.warning, "FBGEMM fp8 kernels are not installed.")
+    CUTLASS_FP8_AVAILABLE = False
-def get_fp8_linear() -> torch.nn.Module:
+def get_fp8_linear() -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """
    if SYSTEM == "cuda":
        major, _ = torch.cuda.get_device_capability()
-        if major == 8:
+        if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
            # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
            #       gives better decoding throughput on L4 and L40.
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
            return GPTQMarlinFP8Linear
@ -94,12 +85,6 @@ def fp8_quantize(
    argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can
    be used without modification).
    """
    if FBGEMM_DYN_AVAILABLE and not scalar and not scale:
        qweight, scale = torch.ops.fbgemm.quantize_fp8_per_row(
            weight, bs=None, scale_ub=scale_upper_bound, output_dtype=qdtype
        )
        return qweight, scale
    if marlin_kernels is not None:
        shape = weight.shape
        qweight, scale = marlin_kernels.scaled_fp8_quant(
@ -107,11 +92,12 @@ def fp8_quantize(
            dtype=qdtype,
            scale=scale,
            scale_ub=scale_upper_bound,
            # TODO: don't do this when we have to use the Torch kernel.
            use_per_token_if_dynamic=not scalar,
        )
        return qweight.reshape(shape), scale
    # weight, scale = quant_weights(weight, torch.int8, False)
    finfo = torch.finfo(qdtype)
    if scale is None:
@ -327,8 +313,8 @@ class Fp8Linear(torch.nn.Module):
        scale_upper_bound: Optional[float] = None,
    ) -> None:
        super().__init__()
-        if FBGEMM_MM_AVAILABLE:
+        if CUTLASS_FP8_AVAILABLE:
-            log_once(logger.info, "Using FBGEMM fp8 optimized kernels")
+            log_once(logger.info, "Using cutlass w8a8 kernels")
        if SYSTEM == "rocm" and qweight.dtype == torch.float8_e4m3fn:
            qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                weight=qweight, weight_scale=scale
@ -339,13 +325,9 @@ class Fp8Linear(torch.nn.Module):
        self.scale = scale.float()
        self.input_scale = input_scale.float() if input_scale is not None else None
-        if FBGEMM_MM_AVAILABLE:
+        if CUTLASS_FP8_AVAILABLE and scale_upper_bound is not None:
-            self.scale_upper_bound = (
+            self.scale_upper_bound = torch.tensor(
-                torch.tensor(
+                scale_upper_bound, dtype=torch.float32, device=qweight.device
                    [scale_upper_bound], dtype=torch.float32, device=qweight.device
                )
                if scale_upper_bound is not None
                else None
            )
        else:
            self.scale_upper_bound = scale_upper_bound
@ -354,7 +336,7 @@ class Fp8Linear(torch.nn.Module):
    @classmethod
    def from_unquant(cls, weight, bias, dtype):
-        qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE)
+        qweight, scale = fp8_quantize(weight, scalar=not CUTLASS_FP8_AVAILABLE)
        return cls(
            qweight=qweight,
            scale=scale,
@ -376,9 +358,6 @@ class Fp8Linear(torch.nn.Module):
        input_scale = kwargs.get("input_scale", None)
        scale_upper_bound = kwargs.get("scale_upper_bound", None)
        if FBGEMM_DYN_AVAILABLE:
            # fbgemm needs float32 scales.
            scale = scale.float()
        return cls(
            qweight=weight,
            scale=scale,
@ -397,20 +376,14 @@ class Fp8Linear(torch.nn.Module):
        return cls._device_identity_cache[device]
    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        if FBGEMM_MM_AVAILABLE:
+        if CUTLASS_FP8_AVAILABLE:
            # cutlass FP8 supports per-token scales, so get non-scalar scales.
            qinput, scale = fp8_quantize(
-                input, scale_upper_bound=self.scale_upper_bound
+                input, scale_upper_bound=self.scale_upper_bound, scalar=False
            )
-
+            return marlin_kernels.cutlass_scaled_mm(
-            y = torch.ops.fbgemm.f8f8bf16_rowwise(
+                qinput, self.qweight.t(), scale, self.scale, input.dtype, self.bias
                qinput,
                self.qweight,
                scale,
                self.scale,
                use_fast_accum=True,
                bias=self.bias,
            )
            return y.to(self.dtype)
        qinput, scale = fp8_quantize(
            input,
--- a/server/text_generation_server/layers/gptq/init.py
+++ b/server/text_generation_server/layers/gptq/init.py
@ -10,8 +10,8 @@ from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
 if SYSTEM == "ipex":
    from .ipex import QuantLinear
-elif SYSTEM == "cuda":
+elif SYSTEM in {"cuda", "rocm"}:
-    from .cuda import QuantLinear
+    from .triton import QuantLinear
@dataclass
--- a/server/text_generation_server/layers/gptq/triton.py
+++ b/server/text_generation_server/layers/gptq/triton.py
--- a/server/text_generation_server/models/init.py
+++ b/server/text_generation_server/models/init.py
@ -226,7 +226,7 @@ class ModelType(enum.Enum):
        "url": "https://huggingface.co/databricks/dbrx-instruct",
    }
    MAMBA = {
-        "type": "ssm",
+        "type": "mamba",
        "name": "Mamba",
        "url": "https://huggingface.co/state-spaces/mamba-2.8b-slimpj",
    }
@ -410,12 +410,6 @@ def get_model(
            else:
                # These quantizers only work with float16 params.
                dtype = torch.float16
        elif quantize == "fp8":
            from text_generation_server.layers.fp8 import FBGEMM_DYN_AVAILABLE
            if FBGEMM_DYN_AVAILABLE:
                # fbgemm kernels are fp8xfp8->bf16
                dtype = torch.bfloat16
        else:
            # Keep it as default for now and let
            # every model resolve their own default dtype.
@ -624,6 +618,10 @@ def get_model(
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
    elif model_type == "ssm":
        raise RuntimeError(
            "`ssm` models have been deprecated in favor of `mamba` models, which follow standard HF formats. Check out a list here: https://huggingface.co/models?search=mamba%20-hf"
        )
    if model_id.startswith("facebook/galactica"):
        return CausalLM(
--- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@ -196,6 +196,9 @@ class MambaModel(nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        prefix = "backbone"
        try:
            self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights)
        self.blocks = nn.ModuleList(
            [
@ -206,7 +209,10 @@ class MambaModel(nn.Module):
        self.norm_f = FastRMSNorm.load(
            f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon
        )
-        self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
+        try:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
        self.config = config
    def forward(
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@ -71,6 +71,14 @@ from text_generation_server.utils.import_utils import (
    synchronize,
    get_free_memory,
 )
 from text_generation_server.models.metadata_kernels import (
    has_triton,
    copy_next_input_ids_inplace,
    block_tables_to_ragged,
    block_tables_to_padded,
    prepare_position_slot_ids,
    slots_filtering,
 )
 tracer = trace.get_tracer(__name__)
@ -78,6 +86,10 @@ tracer = trace.get_tracer(__name__)
 SLIDING_WINDOW: Optional[int] = None
 def small_power_of_2(n: int):
    return 1 << ((n - 1).bit_length() - 1)
 def set_sliding_window(sliding_window: int):
    global SLIDING_WINDOW
    SLIDING_WINDOW = sliding_window
@ -147,8 +159,10 @@ class FlashCausalLMBatch(Batch):
    # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences
    block_tables_tensor: torch.Tensor
    # tensor of length \sum_{i=0}^{b} max_s_i  holding the paged attention slots for all sequences
-    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
+    slots: torch.Tensor
-    slots: Optional[torch.Tensor]
+    # list of length b + 1  containing the cumulative sequence slot lengths of the sequences in the batch
    # used for filtering
    cu_slots: torch.Tensor
    max_input_length: int
    max_current_length: int
@ -159,7 +173,7 @@ class FlashCausalLMBatch(Batch):
    prefilling_mask: List[bool]
    # Prefill metadata tensors to efficiently compute logprobs
-    # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill
+    # tensor of length b + 1  containing the cumulative sequence lengths of the sequences in the batch, only used in prefill
    cu_seqlen_prefill: Optional[torch.Tensor]
    # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers
    # as we only keep SLIDING_WINDOW values instead of the whole tensor
@ -257,6 +271,8 @@ class FlashCausalLMBatch(Batch):
        all_input_ids = []
        all_postfix_ids = []
        requests_idx_mapping = {}
        slots = []
        cu_slots = [0]
        next_token_chooser_parameters = []
        stopping_criterias = []
@ -268,7 +284,9 @@ class FlashCausalLMBatch(Batch):
        max_length = 0
        max_blocks = 0
        cu_blocks = [0]
        block_tables = []
        block_tables_ragged = []
        # Parse batch
        for i, (r, tokenized_input) in enumerate(
@ -341,10 +359,21 @@ class FlashCausalLMBatch(Batch):
                request_blocks = [
                    b for b in range(num_blocks, num_blocks + needed_blocks)
                ]
                request_slots = [
                    s
                    for b in request_blocks
                    for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
                ]
            else:
                request_blocks = r.blocks
                request_slots = r.slots
            block_tables.append(request_blocks)
            block_tables_ragged.extend(request_blocks)
            cu_blocks.append(len(block_tables_ragged))
            slots.extend(request_slots)
            cu_slots.append(len(slots))
            cache_lengths.append(cache_length)
            num_blocks += len(request_blocks)
@ -378,16 +407,34 @@ class FlashCausalLMBatch(Batch):
            top_n_tokens, device=device, dtype=torch.int64
        )
-        block_tables_tensor = torch.zeros(
+        block_tables_ragged = torch.tensor(
-            (len(block_tables), max_blocks), dtype=torch.int32, device="cpu"
+            block_tables_ragged, device=device, dtype=torch.int32
        )
        cu_blocks = torch.tensor(cu_blocks, device=device, dtype=torch.int64)
        block_tables_tensor = torch.empty(
            (len(block_tables), max_blocks),
            device=device,
            dtype=torch.int32,
        )
        # If the device supports Triton, we can use a fused kernel
        if has_triton():
            block_tables_to_padded(
                max_blocks, cu_blocks, block_tables_tensor, block_tables_ragged
            )
        else:
            for i, request_blocks in enumerate(block_tables):
-            block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks)
+                block_tables_tensor[i, : len(request_blocks)] = torch.tensor(
-        block_tables_tensor = block_tables_tensor.to(device)
+                    request_blocks
                )
        prompt_lengths_tensor = torch.tensor(
            prompt_lengths, dtype=torch.int32, device=device
        )
        slots = torch.tensor(slots, dtype=torch.int64, device=device)
        cu_slots = torch.tensor(cu_slots, dtype=torch.int64)
        return cls(
            batch_id=pb.id,
            requests=pb.requests,
@ -420,7 +467,8 @@ class FlashCausalLMBatch(Batch):
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=None,
-            slots=None,
+            slots=slots,
            cu_slots=cu_slots,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
@ -457,6 +505,7 @@ class FlashCausalLMBatch(Batch):
        # Used to index into tensors
        indices = []
        if not has_triton():
            # slots to keep after filtering
            slot_filtering_indices = torch.zeros(
                self.slots.shape[0], dtype=torch.bool, device=device
@ -477,6 +526,7 @@ class FlashCausalLMBatch(Batch):
        cache_lengths = []
        prefix_offsets = []
        read_offsets = []
        cu_slots = [0]
        prefilling_mask = []
        prefill_logprob_tokens = []
@ -487,8 +537,8 @@ class FlashCausalLMBatch(Batch):
        num_blocks = 0
        max_blocks = 0
-        # Cumulative length
+        max_slots = 0
-        cumulative_max_length = 0
+        cumulative_slot_tokens = 0
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
@ -531,29 +581,27 @@ class FlashCausalLMBatch(Batch):
            num_blocks += len(request_block_table)
            block_tables.append(request_block_table)
            start_slot = self.cu_slots[idx]
            end_slot = self.cu_slots[idx + 1]
            slot_length = end_slot - start_slot
            if not has_triton():
                # Set slice
                slot_filtering_indices[start_slot:end_slot] = True
            cu_slots.append(cumulative_slot_tokens + slot_length)
            # Input ids if the request was part of a prefilling batch
            # If the batch was decoding we can index into the tensor directly later
            if self.prefilling:
                input_ids.append(self.input_ids[idx])
            else:
                # Copy to tensor (CPU)
-                slot_indices[i] = cumulative_max_length
+                slot_indices[i] = cumulative_slot_tokens + request_cache_length
                remaining_tokens = (
                    stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
                )
                # Set slice
                slot_filtering_indices[
                    self.slot_indices[idx] : self.slot_indices[idx]
                    + request_input_length
                    + remaining_tokens
                    - 1
                ] = True
                cumulative_max_length += request_input_length + remaining_tokens - 1
            cumulative_slot_tokens += slot_length
            max_blocks = max(max_blocks, len(request_block_table))
            max_slots = max(max_slots, slot_length)
        all_input_ids_tensor = self.all_input_ids_tensor[indices]
        block_tables_tensor = self.block_tables_tensor[indices]
@ -564,11 +612,22 @@ class FlashCausalLMBatch(Batch):
        )
        prompt_lengths_tensor = self.prompt_lengths_tensor[indices]
        cu_slots = torch.tensor(cu_slots, dtype=torch.int64)
        if not has_triton():
            slots = self.slots[slot_filtering_indices]
        else:
            slots = self.slots.new_empty(cumulative_slot_tokens)
            gpu_cu_slots = cu_slots.to(device)
            slots_indexing_start = self.cu_slots.to(device)[indices]
            slots_filtering(
                max_slots, self.slots, slots, gpu_cu_slots, slots_indexing_start
            )
        if self.prefilling:
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids = None
            slot_indices = None
            slots = None
            cache_lengths_tensor = None
            input_lengths_tensor = None
            adapter_meta = None
@ -578,7 +637,6 @@ class FlashCausalLMBatch(Batch):
            position_ids = self.position_ids[indices]
            adapter_indices = self.adapter_meta.adapter_indices[indices]
            input_lengths_tensor = self.input_lengths_tensor[indices]
            slots = self.slots[slot_filtering_indices]
            cache_lengths_tensor = self.cache_lengths_tensor[indices]
            # Move to GPU now that we have the whole tensor
@ -607,6 +665,7 @@ class FlashCausalLMBatch(Batch):
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            slots=slots,
            cu_slots=cu_slots,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=self.prefilling,
@ -653,9 +712,7 @@ class FlashCausalLMBatch(Batch):
        for b in batches:
            total_batch_size += len(b)
            max_blocks = max(max_blocks, b.max_blocks)
-            # If `b` is prefilling and was just filtered, `b.slots` is None
+            total_slots += len(b.slots)
            # `total_slots` is not used if any of the batches is prefilling
            total_slots += len(b.slots) if not b.prefilling else 0
            num_blocks += b.num_blocks
            speculative_length = (
                b.speculative_ids.shape[1] if b.speculative_ids is not None else 0
@ -675,11 +732,12 @@ class FlashCausalLMBatch(Batch):
            )
            prefilling = prefilling or b.prefilling
        slots = batches[0].slots.new_empty(total_slots)
        cu_slots = torch.zeros(total_batch_size + 1, dtype=torch.int64)
        if prefilling:
            input_ids = []
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids = None
            slots = None
            slot_indices = None
            cache_lengths_tensor = None
            input_lengths_tensor = None
@ -688,7 +746,6 @@ class FlashCausalLMBatch(Batch):
        else:
            input_ids = batches[0].input_ids.new_empty(total_batch_size)
            position_ids = batches[0].position_ids.new_empty(total_batch_size)
            slots = batches[0].slots.new_empty(total_slots)
            slot_indices = batches[0].slot_indices.new_empty(total_batch_size)
            input_lengths_tensor = batches[0].input_lengths_tensor.new_empty(
                total_batch_size
@ -764,13 +821,16 @@ class FlashCausalLMBatch(Batch):
            ] = batch.block_tables_tensor[:, :max_blocks]
            prompt_lengths_tensor[start_index:end_index] = batch.prompt_lengths_tensor
            if not prefilling:
            slots_start_index = cumulative_slots
            slots_end_index = cumulative_slots + len(batch.slots)
            slots[slots_start_index:slots_end_index] = batch.slots
            cu_slots[start_index + 1 : end_index + 1] = (
                batch.cu_slots[1:] + cumulative_slots
            )
            if not prefilling:
                input_ids[start_index:end_index] = batch.input_ids
                position_ids[start_index:end_index] = batch.position_ids
                slots[slots_start_index:slots_end_index] = batch.slots
                slot_indices[start_index:end_index] = (
                    batch.slot_indices + cumulative_slots
                )
@ -792,9 +852,6 @@ class FlashCausalLMBatch(Batch):
                    batch.adapter_meta.adapter_segments,
                    batch.adapter_meta.segment_indices,
                )
                # Update
                cumulative_slots += len(batch.slots)
            else:
                if isinstance(batch.input_ids, torch.Tensor):
                    batch.input_ids = batch.input_ids.view(-1, 1).tolist()
@ -819,6 +876,7 @@ class FlashCausalLMBatch(Batch):
            top_n_tokens.extend(batch.top_n_tokens)
            # Update
            cumulative_slots += len(batch.slots)
            cumulative_batch_size += len(batch)
        next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
@ -858,6 +916,7 @@ class FlashCausalLMBatch(Batch):
            cache_lengths=cache_lengths,
            cache_lengths_tensor=cache_lengths_tensor,
            slots=slots,
            cu_slots=cu_slots,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=prefilling,
@ -890,15 +949,50 @@ class FlashCausalLMBatch(Batch):
        # it simplifies everything
        assert self.speculative_ids is None
        device = self.block_tables_tensor.device
        if isinstance(self.input_ids, list):
            if len(self) > 1:
                input_ids = np.concatenate(self.input_ids, dtype=np.int64)
            else:
                input_ids = self.input_ids[0]
            self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
        self.input_lengths_tensor = torch.tensor(
            self.input_lengths, dtype=torch.int32, device=device
        )
        self.cu_seqlen_prefill = torch.nn.functional.pad(
            torch.cumsum(self.input_lengths_tensor, dim=0), (1, 0)
        ).to(torch.int32)
        self.cache_lengths_tensor = torch.tensor(
            self.cache_lengths, dtype=torch.int32, device=device
        )
        # If the device supports Triton, we can use a fused kernel
        if has_triton():
            self.position_ids = torch.empty(
                len(self.input_ids), dtype=torch.int32, device=device
            )
            self.slot_indices = torch.empty(
                len(self.input_ids), dtype=torch.int64, device=device
            )
            cu_slots_gpu = self.cu_slots.to(device)
            prepare_position_slot_ids(
                self.max_input_length,
                self.cache_lengths_tensor,
                self.cu_seqlen_prefill,
                cu_slots_gpu,
                self.position_ids,
                self.slot_indices,
            )
        sliding_window = get_sliding_windows()
        position_ids = []
        cu_seqlen_prefill = [0]
        slot_indices = []
        prefill_cache_indices = []
        all_prefill_logprobs = True
        no_prefill_logprobs = True
        prefill_head_indices = []
        prefill_next_token_indices = []
        prefill_cu_outlens = [0]
        # Cumulative length
@ -906,7 +1000,6 @@ class FlashCausalLMBatch(Batch):
        cumulative_slot_tokens = 0
        prefill_out_cumulative_length = 0
        slots = []
        adapter_indices_list = []
        adapter_set = set()
@ -928,15 +1021,14 @@ class FlashCausalLMBatch(Batch):
            )
        ):
            next_chunk_length = input_length
            if not has_triton():
                # Position ids
                request_position_ids = torch.arange(
                    cache_length, cache_length + input_length, dtype=torch.int32
                )
                position_ids.append(request_position_ids)
            # Add cumulative lengths of all previous inputs
            cu_seqlen_prefill.append(cumulative_length + input_length)
                if not r.slots:
                    request_slots = [
                        s
@ -946,13 +1038,17 @@ class FlashCausalLMBatch(Batch):
                else:
                    request_slots = r.slots
            request_slots = request_slots[cache_length:]
                request_slot_indices = torch.arange(
-                cumulative_slot_tokens,
+                    cache_length + cumulative_slot_tokens,
-                cumulative_slot_tokens + input_length,
+                    cache_length + cumulative_slot_tokens + input_length,
                    dtype=torch.int64,
                )
                slot_indices.append(request_slot_indices)
                # Update
                cumulative_slot_tokens += len(request_slots)
            # Create tensor to slice into the kv tensor in prefill
            if sliding_window is not None:
                request_prefill_cache_indices = torch.arange(
@ -967,6 +1063,49 @@ class FlashCausalLMBatch(Batch):
            all_prefill_logprobs = all_prefill_logprobs and prefill_logprobs
            no_prefill_logprobs = no_prefill_logprobs and not prefill_logprobs
            if prefill_logprobs:
                prefill_cu_outlens.append(prefill_out_cumulative_length + input_length)
                prefill_out_cumulative_length += input_length
            else:
                prefill_cu_outlens.append(prefill_out_cumulative_length + 1)
                prefill_out_cumulative_length += 1
            if sliding_window is not None:
                prefill_cache_indices.append(request_prefill_cache_indices)
            ADAPTER_TO_INDEX = get_adapter_to_index()
            if ADAPTER_TO_INDEX:
                adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0)
                adapter_indices_list.append(
                    torch.full((next_chunk_length,), adapter_index)
                )
                adapter_set.add(adapter_index)
            # Update
            cumulative_length += next_chunk_length
        if not all_prefill_logprobs and not no_prefill_logprobs:
            prefill_head_indices = []
            prefill_next_token_indices = []
            # Cumulative length
            cumulative_length = 0
            prefill_out_cumulative_length = 0
            for i, (
                r,
                input_length,
                request_prefilling,
            ) in enumerate(
                zip(
                    self.requests,
                    self.input_lengths,
                    self.prefilling_mask,
                )
            ):
                # Prefill logprobs is ignored if the request is done prefilling
                prefill_logprobs = r.prefill_logprobs and request_prefilling
                if prefill_logprobs:
                    prefill_head_indices.append(
                        torch.arange(
@ -978,7 +1117,6 @@ class FlashCausalLMBatch(Batch):
                    prefill_next_token_indices.append(
                        prefill_out_cumulative_length + input_length - 1
                    )
                prefill_cu_outlens.append(prefill_out_cumulative_length + input_length)
                    prefill_out_cumulative_length += input_length
                else:
                    prefill_head_indices.append(
@ -988,63 +1126,40 @@ class FlashCausalLMBatch(Batch):
                        )
                    )
                    prefill_next_token_indices.append(prefill_out_cumulative_length)
                prefill_cu_outlens.append(prefill_out_cumulative_length + 1)
                    prefill_out_cumulative_length += 1
            slots.extend(request_slots)
            slot_indices.append(request_slot_indices)
            if sliding_window is not None:
                prefill_cache_indices.append(request_prefill_cache_indices)
            ADAPTER_TO_INDEX = get_adapter_to_index()
            adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0)
            adapter_indices_list.append(torch.full((next_chunk_length,), adapter_index))
            adapter_set.add(adapter_index)
                # Update
-            cumulative_length += next_chunk_length
+                cumulative_length += input_length
            cumulative_slot_tokens += len(request_slots)
        device = self.block_tables_tensor.device
        if isinstance(self.input_ids, list):
            if len(self) > 1:
                input_ids = np.concatenate(self.input_ids, dtype=np.int64)
            else:
                input_ids = self.input_ids[0]
            self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
        if len(self) > 1:
            if position_ids:
                position_ids = torch.cat(position_ids)
            if slot_indices:
                slot_indices = torch.cat(slot_indices)
            if sliding_window is not None:
                prefill_cache_indices = torch.cat(prefill_cache_indices)
        else:
            if position_ids:
                position_ids = position_ids[0]
            if slot_indices:
                slot_indices = slot_indices[0]
            if sliding_window is not None:
                prefill_cache_indices = prefill_cache_indices[0]
-        self.prefill_cu_outlens = prefill_cu_outlens
+        if not has_triton():
        cu_seqlen_prefill = torch.tensor(
            cu_seqlen_prefill, device=device, dtype=torch.int32
        )
        self.cu_seqlen_prefill = cu_seqlen_prefill
            self.position_ids = position_ids.to(device)
            self.slot_indices = slot_indices.to(device)
        self.prefill_cu_outlens = prefill_cu_outlens
        self.prefill_cache_indices = (
            prefill_cache_indices.to(device) if sliding_window is not None else None
        )
        self.input_lengths_tensor = torch.tensor(
            self.input_lengths, dtype=torch.int32, device=device
        )
        if all_prefill_logprobs:
            prefill_head_indices = None
-            prefill_next_token_indices = cu_seqlen_prefill[1:] - 1
+            prefill_next_token_indices = self.cu_seqlen_prefill[1:] - 1
        elif no_prefill_logprobs:
-            prefill_head_indices = cu_seqlen_prefill[1:] - 1
+            prefill_head_indices = self.cu_seqlen_prefill[1:] - 1
            prefill_next_token_indices = None
        else:
            prefill_head_indices = torch.cat(prefill_head_indices).to(device)
@ -1054,17 +1169,21 @@ class FlashCausalLMBatch(Batch):
        self.prefill_head_indices = prefill_head_indices
        self.prefill_next_token_indices = prefill_next_token_indices
-        self.slots = torch.tensor(slots, dtype=torch.int64, device=device)
+
-        self.cache_lengths_tensor = torch.tensor(
+        if adapter_set:
            self.cache_lengths, dtype=torch.int32, device=device
        )
            adapter_indices = torch.cat(adapter_indices_list).to(
                dtype=torch.int64, device=device
            )
            adapter_segments, adapter_segment_indices = find_segments(adapter_indices)
        else:
            adapter_indices = torch.zeros_like(self.input_ids)
            adapter_segments = [0, len(adapter_indices)]
            adapter_segment_indices = [len(adapter_indices) - 1]
        adapter_segments = torch.tensor(
            adapter_segments, dtype=torch.int32, device=device
        )
        self.adapter_meta = AdapterBatchMetadata(
            adapter_indices=adapter_indices,
            adapter_set=adapter_set,
@ -1288,6 +1407,9 @@ class FlashCausalLM(Model):
                block_tables=block_tables,
                input_lengths=input_lengths,
                cache_lengths=cache_lengths,
                input_lengths_tensor=input_lengths_tensor,
                cache_lengths_tensor=cache_lengths_tensor,
                max_current_length=max_s,
            )
            from text_generation_server.layers.attention.flashinfer import (
                create_decode_state_cuda_graphs,
@ -1377,11 +1499,22 @@ class FlashCausalLM(Model):
                self.cuda_graphs[bs]["speculative_logits"] = speculative_logits
        torch.cuda.synchronize()
-    def warmup(self, batch: FlashCausalLMBatch):
+    def warmup(
        self,
        batch: FlashCausalLMBatch,
        max_input_tokens: Optional[int],
        max_total_tokens: Optional[int],
    ):
        # The warmup batch is the biggest batch we could ever receive
        self.kv_cache = []
        empty_cache()
        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
        # Calculate the number of blocks that can be allocated with the free memory
        dtype_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size()
        cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
        total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
        try:
            self.init_kv_cache(
                batch.num_blocks,
@ -1393,10 +1526,11 @@ class FlashCausalLM(Model):
            )
            max_bt = batch.max_blocks
            max_s = max_bt * BLOCK_SIZE
            batch_num_blocks = batch.num_blocks
            if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
                torch.cuda.tunable.tuning_enable(False)
-            _, batch, _ = self.generate_token(batch)
+            _, _batch, _ = self.generate_token(batch)
        except torch.cuda.OutOfMemoryError as e:
            raise RuntimeError(
                f"Not enough memory to handle {batch.to_pb().current_tokens} prefill tokens. "
@ -1405,14 +1539,7 @@ class FlashCausalLM(Model):
        synchronize(self.device)
        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
        # Calculate the number of blocks that can be allocated with the free memory
        dtype_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size()
        cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
        total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
        free_memory = get_free_memory(self.device, MEMORY_FRACTION)
        batch_num_blocks = batch.num_blocks if batch is not None else 0
        num_blocks = (
            # Leave 5% for some wiggle room
@ -1422,8 +1549,27 @@ class FlashCausalLM(Model):
        )
        log_master(logger.info, f"KV-cache blocks: {num_blocks}, size: {BLOCK_SIZE}")
        if max_total_tokens is None:
            if get_support_chunking():
                model_max_length = self.tokenizer.model_max_length
                max_input_tokens = (
                    min((num_blocks * BLOCK_SIZE - 1), model_max_length)
                    if max_input_tokens is None
                    else max_input_tokens
                )
                max_total_tokens = num_blocks * BLOCK_SIZE
-        del batch
+            else:
                max_total_tokens = sum(batch.cache_lengths)
                max_input_tokens = (
                    max_total_tokens - 1
                    if max_input_tokens is None
                    else max_input_tokens
                )
        del _batch, batch
        self.kv_cache = []
        empty_cache()
        self.init_kv_cache(
            num_blocks,
@ -1505,7 +1651,9 @@ class FlashCausalLM(Model):
                logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})."
            )
-        return int(num_blocks * BLOCK_SIZE)
+        assert max_input_tokens is not None
        assert max_total_tokens is not None
        return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens
    def tunableop_warmup(self, seqlen: int):
        input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device)
@ -1621,6 +1769,9 @@ class FlashCausalLM(Model):
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
@ -1661,6 +1812,9 @@ class FlashCausalLM(Model):
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            # assert block_tables.shape[0] >= slots.shape[0]
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
@ -1756,7 +1910,6 @@ class FlashCausalLM(Model):
        else:
            prefill_logprobs = None
            next_token_logits = out
            next_adapter_indices = batch.adapter_meta.adapter_indices
        finished_prefilling = True
        next_chunk_lengths = []
@ -1827,13 +1980,12 @@ class FlashCausalLM(Model):
        # Since we are done prefilling, all the tensors that were concatenating values for all the requests
        # instantly become of shape [BATCH_SIZE]
        if prefill and finished_prefilling:
-            next_position_ids = batch.position_ids.new_empty(len(batch))
+            indices = batch.cu_seqlen_prefill[1:] - 1
-            batch.slot_indices = batch.slot_indices[batch.cu_seqlen_prefill[1:] - 1]
+            batch.position_ids = batch.position_ids[indices]
-            next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty(
+            batch.slot_indices = batch.slot_indices[indices]
-                len(batch)
+            batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
-            )
+                indices
-        elif not prefill:
+            ]
            next_position_ids = batch.position_ids
        # Zipped iterator
        iterator = zip(
@ -1852,8 +2004,10 @@ class FlashCausalLM(Model):
        # It is faster if we delay this sync for the maximum amount of time
        # For each member of the batch
        index = 0
        # Cumulative length
        cu_accepted_ids = torch.nn.functional.pad(
            torch.cumsum(accepted_ids, dim=0), (1, 0)
        )
        cumulative_length = 0
        for i, (
            request,
@ -1865,21 +2019,6 @@ class FlashCausalLM(Model):
            request_was_prefilling,
            request_is_prefilling,
        ) in enumerate(iterator):
            if prefill and finished_prefilling:
                # Indexing metadata
                _start_index = cumulative_length
                end_index = cumulative_length + input_length
                # Initialize position_ids
                # In decode, we do not need this as we can just increment position ids
                next_position_ids[i] = batch.position_ids[end_index - 1]
                # Initialize adapter indices
                # In decode, we only have one token per row in the batch, so grab last index
                next_adapter_indices[i] = batch.adapter_meta.adapter_indices[
                    end_index - 1
                ]
            # Used to gather prefill logprobs
            # Copy batch.all_input_ids_tensor to prefill_token_indices
            if request.prefill_logprobs and request_was_prefilling:
@ -1898,25 +2037,39 @@ class FlashCausalLM(Model):
                    # Set prefill_tokens_indices to the correct slice
                    prefill_tokens_indices = ids
-            if not request_is_prefilling:
+            # If the device does not support triton, we copy one by one
            if not request_is_prefilling and not has_triton():
                # Only save tokens if we are done prefilling for this request
-                for j in range(n_accepted_ids):
+                batch.all_input_ids_tensor[
-                    batch.all_input_ids_tensor[i, cache_length + input_length + j] = (
+                    i,
-                        next_input_ids[index + j]
+                    batch.cache_lengths_tensor[i]
-                    )
+                    + batch.input_lengths[i] : batch.cache_lengths_tensor[i]
-            index += n_accepted_ids
+                    + batch.input_lengths[i]
                    + accepted_ids[i],
                ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
            cumulative_length += input_length
        # If the device support triton, we can use a fused kernel
        if has_triton():
            copy_next_input_ids_inplace(
                speculate + 1,
                batch.all_input_ids_tensor,
                batch.cache_lengths_tensor,
                batch.input_lengths_tensor,
                batch.prompt_lengths_tensor,
                next_input_ids,
                cu_accepted_ids,
            )
        # Update values
        # These values can be updated without a GPU -> CPU sync
        if not prefill or (prefill and finished_prefilling):
-            batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1]
+            batch.input_ids = next_input_ids[cu_accepted_ids[1:] - 1]
            batch.speculative_ids = speculative_ids
-            batch.position_ids = next_position_ids + accepted_ids
+            batch.position_ids += accepted_ids
-            batch.cache_lengths_tensor += batch.input_lengths_tensor
+            batch.cache_lengths_tensor += batch.input_lengths_tensor + accepted_ids - 1
-            batch.input_lengths_tensor = accepted_ids.to(dtype=torch.int32)
+            batch.input_lengths_tensor = torch.ones_like(batch.input_lengths_tensor)
            batch.slot_indices += accepted_ids
            batch.adapter_meta.adapter_indices = next_adapter_indices
        if prefill and prefill_logprobs:
            # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size))
@ -2093,8 +2246,10 @@ class FlashCausalLM(Model):
                # processing
                stopped = False
                new_input_length = next_chunk_lengths[i]
                new_cache_length = cache_length + input_length
            else:
-                new_input_length = n_accepted_ids
+                new_input_length = 1
                new_cache_length = cache_length + input_length + n_accepted_ids - 1
                # Append next token to all tokens
                next_token_texts = []
                left = 0
@ -2206,12 +2361,10 @@ class FlashCausalLM(Model):
            # Update values
            index += n_accepted_ids
-            current_cache_length = cache_length + input_length
+            batch.cache_lengths[i] = new_cache_length
-            batch.cache_lengths[i] = current_cache_length
+            batch.max_input_length = max(batch.max_input_length, new_input_length)
-            current_input_length = new_input_length
+            batch.input_lengths[i] = new_input_length
-            batch.max_input_length = max(batch.max_input_length, current_input_length)
+            current_length = new_cache_length + new_input_length
            batch.input_lengths[i] = current_input_length
            current_length = current_cache_length + current_input_length
            batch.max_current_length = max(batch.max_current_length, current_length)
            batch.prefix_offsets[i] = prefix_offset
@ -2258,11 +2411,6 @@ class FlashCausalLM(Model):
                state=(
                    state if state is not None else self.prefill_with_paged_kv_state
                ),
                # block_tables=block_tables_to_ragged(
                #     block_tables=block_tables,
                #     input_lengths=input_lengths,
                #     cache_lengths=cache_lengths,
                # ),
                block_tables=block_tables,
                cu_seqlens=cu_seqlen_prefill,
                input_lengths=input_lengths_tensor + cache_lengths_tensor,
@ -2287,23 +2435,3 @@ class FlashCausalLM(Model):
                dtype=self.dtype,
                window_left=self.sliding_window,
            )
 def block_tables_to_ragged(
    *, block_tables: torch.Tensor, input_lengths: List[int], cache_lengths: List[int]
 ) -> torch.Tensor:
    """Convert block table to ragged format compatible with FlashInfer."""
    assert len(input_lengths) == len(cache_lengths)
    total_len = sum(input_lengths) + sum(cache_lengths)
    block_tables_ragged = torch.empty(
        total_len, dtype=torch.int32, device=block_tables.device
    )
    offset = 0
    for i, (input_length, cache_length) in enumerate(zip(input_lengths, cache_lengths)):
        seq_len = cache_length + input_length
        block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
        offset += seq_len
    return block_tables_ragged
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@ -1,7 +1,7 @@
 import torch
 import torch.distributed
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
-from typing import Optional
+from typing import Optional, Union
 from text_generation_server.models.custom_modeling.mamba_modeling import (
    MambaConfig,
 )
@ -475,7 +475,9 @@ class Mamba(Model):
    def batch_type(self) -> Type[MambaBatch]:
        return MambaBatch
-    def warmup(self, batch) -> Optional[int]:
+    def warmup(
        self, batch, max_input_tokens: Optional[int], max_total_tokens: Optional[int]
    ) -> Union[Optional[int], Optional[int], Optional[int]]:
        # TODO: implement warmup for Mamba if needed
        if CUDA_GRAPHS:
            if self.speculate is None or self.speculate == 0:
@ -489,7 +491,12 @@ class Mamba(Model):
        else:
            logger.info(f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS}).")
-        return None
+        if max_total_tokens is None:
            max_total_tokens = min(self.tokenizer.model_max_length, 4096)
        if max_input_tokens is None:
            max_input_tokens = max_total_tokens - 1
        return None, max_input_tokens, max_total_tokens
    def cuda_graph_warmup(self, batch_size: int):
        input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device)
--- a/server/text_generation_server/models/metadata_kernels.py
+++ b/server/text_generation_server/models/metadata_kernels.py
@ -0,0 +1,347 @@
 import torch
 import triton
 import triton.language as tl
 from loguru import logger
 from typing import List, Optional
 from torch.utils._triton import has_triton as has_triton_torch
 from text_generation_server.utils.import_utils import (
    SYSTEM,
 )
 from text_generation_server.utils.log import log_master
 _HAS_TRITON: Optional[bool] = None
 def has_triton():
    global _HAS_TRITON
    if _HAS_TRITON is None:
        # FIXME: it seems that has_triton_torch is bugged on RocM
        #        For now, only accept cuda
        _HAS_TRITON = has_triton_torch() if SYSTEM == "cuda" else False
        if _HAS_TRITON:
            log_master(logger.info, "Using optimized Triton indexing kernels.")
    return _HAS_TRITON
 def block_tables_to_padded(
    max_blocks: int,
    cu_seqlen: torch.Tensor,
    block_tables: torch.Tensor,
    block_tables_ragged: torch.Tensor,
 ):
    def grid(meta):
        return (
            triton.cdiv(max_blocks, meta["BLOCK_SIZE"]),
            len(block_tables),
        )
    triton_block_tables_to_padded[grid](
        cu_seqlen,
        block_tables,
        block_tables_ragged,
        block_tables.shape[1],
        BLOCK_SIZE=256,
    )
 def block_tables_to_ragged(
    *,
    block_tables: torch.Tensor,
    input_lengths: List[int],
    cache_lengths: List[int],
    input_lengths_tensor: torch.Tensor,
    cache_lengths_tensor: torch.Tensor,
    max_current_length: int
 ) -> torch.Tensor:
    """Convert block table to ragged format compatible with FlashInfer."""
    assert len(input_lengths) == len(cache_lengths)
    total_len = sum(input_lengths) + sum(cache_lengths)
    block_tables_ragged = torch.empty(
        total_len, dtype=torch.int32, device=block_tables.device
    )
    if has_triton():
        cu_seqlen = torch.nn.functional.pad(
            torch.cumsum(input_lengths_tensor + cache_lengths_tensor, dim=0), (1, 0)
        )
        def grid(meta):
            return (
                triton.cdiv(max_current_length, meta["BLOCK_SIZE"]),
                len(cache_lengths),
            )
        triton_block_tables_to_ragged[grid](
            cu_seqlen,
            block_tables,
            block_tables_ragged,
            block_tables.shape[1],
            BLOCK_SIZE=256,
        )
    else:
        offset = 0
        for i, (input_length, cache_length) in enumerate(
            zip(input_lengths, cache_lengths)
        ):
            seq_len = cache_length + input_length
            block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
            offset += seq_len
    return block_tables_ragged
 def copy_next_input_ids_inplace(
    max_next_input_ids: int,
    all_input_ids: torch.Tensor,
    cache_lengths: torch.Tensor,
    input_lengths: torch.Tensor,
    prompt_lengths: torch.Tensor,
    next_input_ids: torch.Tensor,
    cu_accepted_ids: torch.Tensor,
 ):
    def grid(meta):
        return (
            triton.cdiv(max_next_input_ids, meta["BLOCK_SIZE"]),
            len(all_input_ids),
        )
    triton_copy_next_input_ids_inplace[grid](
        all_input_ids,
        cache_lengths,
        input_lengths,
        prompt_lengths,
        next_input_ids,
        cu_accepted_ids,
        all_input_ids.shape[1],
        BLOCK_SIZE=16,
    )
 def prepare_position_slot_ids(
    max_input_length: int,
    cache_lengths: torch.Tensor,
    cu_seqlen: torch.Tensor,
    cu_slots: torch.Tensor,
    position_ids: torch.Tensor,
    slot_indices: torch.Tensor,
 ):
    def grid(meta):
        return (
            triton.cdiv(max_input_length, meta["BLOCK_SIZE"]),
            len(cache_lengths),
        )
    triton_prepare_position_slot_ids[grid](
        cache_lengths, cu_seqlen, cu_slots, position_ids, slot_indices, BLOCK_SIZE=256
    )
 def slots_filtering(
    max_slots: int,
    slots: torch.Tensor,
    filtered_slots: torch.Tensor,
    cu_slots: torch.Tensor,
    slots_start: torch.Tensor,
 ):
    def grid(meta):
        return (
            triton.cdiv(max_slots, meta["BLOCK_SIZE"]),
            len(slots_start),
        )
    triton_slots_filtering[grid](
        slots, filtered_slots, slots_start, cu_slots, BLOCK_SIZE=256
    )
@triton.jit
 def triton_slots_filtering(
    # Inputs
    slots_ptr,
    filtered_slots_ptr,
    slots_start_ptr,
    cu_slots_ptr,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
 ):
    # Position in block_tables_ragged.numel() / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)
    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
    filter_start = tl.load(slots_start_ptr + bid)
    slot_start = tl.load(cu_slots_ptr + bid)
    slot_end = tl.load(cu_slots_ptr + bid + 1)
    mask = (slot_start + block_arange) < slot_end
    slots = tl.load(slots_ptr + filter_start + block_arange, mask=mask)
    tl.store(filtered_slots_ptr + slot_start + block_arange, slots, mask=mask)
@triton.jit
 def triton_block_tables_to_padded(
    # Inputs
    cu_seqlen_ptr,
    # Outputs
    block_tables_ptr,
    block_tables_ragged_ptr,
    # Stride
    stride_block_tables,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
 ):
    # Position in block_tables_ragged.numel() / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)
    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
    seq_start = tl.load(cu_seqlen_ptr + bid)
    seq_end = tl.load(cu_seqlen_ptr + bid + 1)
    mask = (seq_start + block_arange) < seq_end
    blocks = tl.load(block_tables_ragged_ptr + seq_start + block_arange, mask=mask)
    tl.store(
        block_tables_ptr + bid * stride_block_tables + block_arange, blocks, mask=mask
    )
@triton.jit
 def triton_block_tables_to_ragged(
    # Inputs
    cu_seqlen_ptr,
    # Outputs
    block_tables_ptr,
    block_tables_ragged_ptr,
    # Stride
    stride_block_tables,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
 ):
    # Position in block_tables_ragged.numel() / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)
    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
    seq_start = tl.load(cu_seqlen_ptr + bid)
    seq_end = tl.load(cu_seqlen_ptr + bid + 1)
    mask = (seq_start + block_arange) < seq_end
    blocks = tl.load(
        block_tables_ptr + bid * stride_block_tables + block_arange, mask=mask
    )
    tl.store(block_tables_ragged_ptr + seq_start + block_arange, blocks, mask=mask)
@triton.jit
 def triton_copy_next_input_ids_inplace(
    # Inputs
    all_input_ids_ptr,
    cache_lengths_ptr,
    input_lengths_ptr,
    prompt_lengths_ptr,
    next_input_ids_ptr,
    cu_accepted_ids_ptr,
    # Stride
    stride_all_input_ids,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
 ):
    # Position in max_accepted_ids / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)
    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
    # Used for correctly indexing in all_input_ids
    cache_length = tl.load(cache_lengths_ptr + bid)
    input_length = tl.load(input_lengths_ptr + bid)
    prompt_length = tl.load(prompt_lengths_ptr + bid)
    # Start/End of next_input_ids for this request
    next_input_ids_start = tl.load(cu_accepted_ids_ptr + bid)
    next_input_ids_end = tl.load(cu_accepted_ids_ptr + bid + 1)
    # Mask values out of range
    mask = (next_input_ids_start + block_arange) < next_input_ids_end
    # Mask values for request still prefilling
    decode_mask = (cache_length + input_length + block_arange) >= prompt_length
    mask = mask & decode_mask
    # Load this request next input ids
    next_input_ids = tl.load(
        next_input_ids_ptr + next_input_ids_start + block_arange, mask=mask
    )
    # Store in all_input_ids, since it is a 2D tensor, apply stride * bid
    tl.store(
        all_input_ids_ptr
        + stride_all_input_ids * bid
        + cache_length
        + input_length
        + block_arange,
        next_input_ids,
        mask=mask,
    )
@triton.jit
 def triton_prepare_position_slot_ids(
    # Inputs
    cache_lengths_ptr,
    cu_seqlen_ptr,
    cu_slots_ptr,
    # Outputs
    position_ids_ptr,
    slot_indices_ptr,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
 ):
    # Position in max_input_length / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)
    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
    cache_length = tl.load(cache_lengths_ptr + bid)
    seq_start = tl.load(cu_seqlen_ptr + bid)
    seq_end = tl.load(cu_seqlen_ptr + bid + 1)
    slot_start = tl.load(cu_slots_ptr + bid)
    mask = (seq_start + block_arange) < seq_end
    tl.store(
        position_ids_ptr + seq_start + block_arange,
        cache_length + block_arange,
        mask=mask,
    )
    tl.store(
        slot_indices_ptr + seq_start + block_arange,
        slot_start + cache_length + block_arange,
        mask=mask,
    )
--- a/server/text_generation_server/models/mllama_causal_lm.py
+++ b/server/text_generation_server/models/mllama_causal_lm.py
@ -14,11 +14,9 @@ from transformers import (
 from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.flash_causal_lm import (
    block_tables_to_ragged,
 )
 from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION
 from text_generation_server.layers.attention import Seqlen
 from text_generation_server.models.metadata_kernels import block_tables_to_ragged
 tracer = trace.get_tracer(__name__)
@ -283,6 +281,9 @@ class MllamaCausalLM(VlmCausalLM):
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
@ -338,6 +339,9 @@ class MllamaCausalLM(VlmCausalLM):
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@ -128,9 +128,17 @@ class Model(ABC):
    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
        raise NotImplementedError
-    def warmup(self, batch: B) -> Optional[int]:
+    def warmup(
        self, batch: B, max_input_tokens: Optional[int], max_total_tokens: Optional[int]
    ) -> Tuple[Optional[int], int, int]:
        self.generate_token(batch)
-        return None
+        total = sum(len(i) for i in batch.input_ids)
        if max_total_tokens is None:
            max_total_tokens = total
        if max_input_tokens is None:
            max_input_tokens = max_total_tokens - 1
        return None, max_input_tokens, max_total_tokens
    def decode_token(
        self,
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@ -11,12 +11,12 @@ from text_generation_server.pb import generate_pb2
 from text_generation_server.models.flash_causal_lm import (
    FlashCausalLMBatch,
    FlashCausalLM,
    block_tables_to_ragged,
 )
 from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION
 from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
 from text_generation_server.layers.attention import Seqlen
 from text_generation_server.models.metadata_kernels import block_tables_to_ragged
 tracer = trace.get_tracer(__name__)
@ -363,6 +363,9 @@ class VlmCausalLM(FlashCausalLM):
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
@ -411,6 +414,9 @@ class VlmCausalLM(FlashCausalLM):
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@ -132,10 +132,22 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
            batch = self.model.batch_type.from_pb(
                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )
-        max_supported_total_tokens = self.model.warmup(batch)
+
        # Override default values with None for clearer semantics.
        max_input_tokens = (
            request.max_input_tokens if request.HasField("max_input_tokens") else None
        )
        max_total_tokens = (
            request.max_total_tokens if request.HasField("max_total_tokens") else None
        )
        max_supported_total_tokens, max_input_tokens, max_total_tokens = (
            self.model.warmup(batch, max_input_tokens, max_total_tokens)
        )
        return generate_pb2.WarmupResponse(
-            max_supported_total_tokens=max_supported_total_tokens
+            max_supported_total_tokens=max_supported_total_tokens,
            max_input_tokens=max_input_tokens,
            max_total_tokens=max_total_tokens,
        )
    async def Prefill(self, request, context):